AML Mini-Challenge - Credit Card Affinity Modelling

Author

Dominik Filliger & Noah Leuenberger (2024)

import random
from collections import OrderedDict
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

sns.set_theme()
# plt.style.use('seaborn-white')
# plt.style.use('ggplot')

data_reduction = OrderedDict()

SEED = 1337

def seed_everything(seed):
    np.random.seed(seed)
    random.seed(seed)
    
seed_everything(SEED)

1 Data Import & Wrangling

1.1 Helper Functions

def remap_values(df, column, mapping):
    # assert that all values in the column are in the mapping except for NaN
    assert df[column].dropna().isin(mapping.keys()).all()

    df[column] = df[column].map(mapping, na_action="ignore")
    return df


def map_empty_to_nan(df, column):
    if df[column].dtype != "object":
        return df

    df[column] = df[column].replace(r"^\s*$", np.nan, regex=True)
    return df


def read_csv(file_path, sep=";", dtypes=None):
    df = pd.read_csv(file_path, sep=sep, dtype=dtypes)

    for col in df.columns:
        df = map_empty_to_nan(df, col)

    return df
def plot_categorical_variables(df, categorical_columns, fill_na_value="NA"):
    """
    Plots count plots for categorical variables in a DataFrame, filling NA values with a specified string.

    Parameters:
    - df: pandas.DataFrame containing the data.
    - categorical_vars: list of strings, names of the categorical variables in df to plot.
    - fill_na_value: string, the value to use for filling NA values in the categorical variables.
    """
    # Fill NA values in the specified categorical variables
    for var in categorical_columns:
        if df[var].isna().any():
            df[var] = df[var].fillna(fill_na_value)

    total = float(len(df))
    fig, axes = plt.subplots(
        nrows=len(categorical_columns), figsize=(8, len(categorical_columns) * 5)
    )

    if len(categorical_columns) == 1:  # If there's only one categorical variable, wrap axes in a list
        axes = [axes]

    for i, var in enumerate(categorical_columns):
        ax = sns.countplot(
            x=var, data=df, ax=axes[i], order=df[var].value_counts().index
        )

        axes[i].set_title(f"Distribution of {var}")
        axes[i].set_ylabel("Count")
        axes[i].set_xlabel(var)
        # if the number is more thatn 6 rotate the x labels
        if len(df[var].value_counts()) > 6:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")

        for p in ax.patches:
            height = p.get_height()
            ax.text(
                p.get_x() + p.get_width() / 2.0,
                height + 3,
                "{:1.2f}%".format((height / total) * 100),
                ha="center",
            )

    plt.tight_layout()
    plt.show()


def plot_numerical_distributions(df, numerical_columns, kde=True, bins=30):
    """
    Plots the distribution of all numerical variables in a DataFrame.

    Parameters:
    - df: pandas.DataFrame containing the data.
    """

    # Determine the number of rows needed for subplots based on the number of numerical variables
    nrows = len(numerical_columns)

    # Create subplots
    fig, axes = plt.subplots(nrows=nrows, ncols=1, figsize=(8, 5 * nrows))

    if nrows == 1:  # If there's only one numerical variable, wrap axes in a list
        axes = [axes]

    for i, var in enumerate(numerical_columns):
        sns.histplot(df[var], ax=axes[i], kde=kde, bins=bins)
        axes[i].set_title(f"Distribution of {var}")
        axes[i].set_xlabel(var)
        axes[i].set_ylabel("Frequency")

    plt.tight_layout()
    plt.show()


def plot_date_monthly_counts(df, date_column, title):
    """
    Plots the monthly counts of a date column in a DataFrame.

    Parameters:
    - df: pandas.DataFrame containing the data.
    - date_column: string, name of the date column in df to plot.
    - title: string, title of the plot.
    """
    df[date_column] = pd.to_datetime(df[date_column])
    df["month"] = df[date_column].dt.to_period("M")

    monthly_counts = df["month"].value_counts().sort_index()
    monthly_counts.plot(kind="bar")
    plt.title(title)
    plt.xlabel("Month")
    plt.ylabel("Count")
    plt.show()


def add_percentage_labels(ax, hue_order):
    for p in ax.patches:
        height = p.get_height()
        width = p.get_width()
        x = p.get_x()
        y = p.get_y()
        label_text = f"{height:.1f}%"
        label_x = x + width / 2
        label_y = y + height / 2
        ax.text(
            label_x,
            label_y,
            label_text,
            ha="center",
            va="center",
            fontsize=9,
            color="white",
            weight="bold"
        )

1.2 Entities

1.2.1 Accounts

accounts_df = read_csv("data/account.csv")

# Translated frequency from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
accounts_df = remap_values(
    accounts_df,
    "frequency",
    {
        "POPLATEK MESICNE": "MONTHLY_ISSUANCE",
        "POPLATEK TYDNE": "WEEKLY_ISSUANCE",
        "POPLATEK PO OBRATU": "ISSUANCE_AFTER_TRANSACTION",
    },
)

accounts_df["date"] = pd.to_datetime(accounts_df["date"], format="%y%m%d")

accounts_df.rename(
    columns={"date": "account_created", "frequency": "account_frequency"}, inplace=True
)

data_reduction["Total number of accounts"] = len(accounts_df)
accounts_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   account_id         4500 non-null   int64         
 1   district_id        4500 non-null   int64         
 2   account_frequency  4500 non-null   object        
 3   account_created    4500 non-null   datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 140.8+ KB
# todo add some basic eda here
accounts_df.head()
account_id district_id account_frequency account_created
0 576 55 MONTHLY_ISSUANCE 1993-01-01
1 3818 74 MONTHLY_ISSUANCE 1993-01-01
2 704 55 MONTHLY_ISSUANCE 1993-01-01
3 2378 16 MONTHLY_ISSUANCE 1993-01-01
4 2632 24 MONTHLY_ISSUANCE 1993-01-02
accounts_df.nunique()
account_id           4500
district_id            77
account_frequency       3
account_created      1535
dtype: int64
plot_categorical_variables(accounts_df, ["account_frequency"])

plot_numerical_distributions(accounts_df, ["account_created"])

1.2.2 Clients

clients_df = read_csv("data/client.csv")


def parse_birth_number(birth_number):
    birth_number_str = str(birth_number)

    # Extract year, month, and day from birth number from string
    # according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
    year = int(birth_number_str[:2])
    month = int(birth_number_str[2:4])
    day = int(birth_number_str[4:6])

    # Determine sex based on month and adjust month for female clients
    # according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
    if month > 50:
        sex = "Female"
        month -= 50
    else:
        sex = "Male"

    # Validate date
    assert 1 <= month <= 12
    assert 1 <= day <= 31
    assert 0 <= year <= 99

    if month in [4, 6, 9, 11]:
        assert 1 <= day <= 30
    elif month == 2:
        assert 1 <= day <= 29
    else:
        assert 1 <= day <= 31

    # Assuming all dates are in the 1900s
    birth_date = datetime(1900 + year, month, day)
    return pd.Series([sex, birth_date])


clients_df[["sex", "birth_date"]] = clients_df["birth_number"].apply(parse_birth_number)

# Calculate 'age' assuming the reference year is 1999
clients_df["age"] = clients_df["birth_date"].apply(lambda x: 1999 - x.year)

# Drop 'birth_number' column as it is no longer needed
clients_df = clients_df.drop(columns=["birth_number"])

clients_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   client_id    5369 non-null   int64         
 1   district_id  5369 non-null   int64         
 2   sex          5369 non-null   object        
 3   birth_date   5369 non-null   datetime64[ns]
 4   age          5369 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 209.9+ KB
# todo add some basic eda here
clients_df.head()
client_id district_id sex birth_date age
0 1 18 Female 1970-12-13 29
1 2 1 Male 1945-02-04 54
2 3 1 Female 1940-10-09 59
3 4 5 Male 1956-12-01 43
4 5 5 Female 1960-07-03 39
clients_df.describe()
client_id district_id birth_date age
count 5369.000000 5369.000000 5369 5369.000000
mean 3359.011920 37.310114 1953-09-12 09:32:21.143602176 45.801639
min 1.000000 1.000000 1911-08-20 00:00:00 12.000000
25% 1418.000000 14.000000 1940-11-25 00:00:00 31.000000
50% 2839.000000 38.000000 1954-05-06 00:00:00 45.000000
75% 4257.000000 60.000000 1968-06-09 00:00:00 59.000000
max 13998.000000 77.000000 1987-09-27 00:00:00 88.000000
std 2832.911984 25.043690 NaN 17.282283
plot_numerical_distributions(clients_df, ["birth_date", "age"])

1.2.3 Dispositions

dispositions_df = read_csv("data/disp.csv")
dispositions_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5369 entries, 0 to 5368
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   disp_id     5369 non-null   int64 
 1   client_id   5369 non-null   int64 
 2   account_id  5369 non-null   int64 
 3   type        5369 non-null   object
dtypes: int64(3), object(1)
memory usage: 167.9+ KB
dispositions_df.head()
disp_id client_id account_id type
0 1 1 1 OWNER
1 2 2 2 OWNER
2 3 3 2 DISPONENT
3 4 4 3 OWNER
4 5 5 3 DISPONENT
dispositions_df.describe()
disp_id client_id account_id
count 5369.000000 5369.000000 5369.000000
mean 3337.097970 3359.011920 2767.496927
std 2770.418826 2832.911984 2307.843630
min 1.000000 1.000000 1.000000
25% 1418.000000 1418.000000 1178.000000
50% 2839.000000 2839.000000 2349.000000
75% 4257.000000 4257.000000 3526.000000
max 13690.000000 13998.000000 11382.000000
plot_categorical_variables(dispositions_df, ["type"])

As the goal of this model is to address accounts and not client directly we will focus on the clients which own an account and focus solely on them.

dispositions_df = dispositions_df[dispositions_df["type"] == "OWNER"]

1.2.4 Orders

orders_df = read_csv("data/order.csv")

# Translated from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
orders_df = remap_values(
    orders_df,
    "k_symbol",
    {
        "POJISTNE": "Insurance_Payment",
        "SIPO": "Household",
        "LEASING": "Leasing",
        "UVER": "Loan_Payment",
    },
)

orders_df["account_to"] = orders_df["account_to"].astype("category")

orders_df = orders_df.rename(columns={"amount": "debited_amount"})

orders_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6471 entries, 0 to 6470
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   order_id        6471 non-null   int64   
 1   account_id      6471 non-null   int64   
 2   bank_to         6471 non-null   object  
 3   account_to      6471 non-null   category
 4   debited_amount  6471 non-null   float64 
 5   k_symbol        5092 non-null   object  
dtypes: category(1), float64(1), int64(2), object(2)
memory usage: 573.9+ KB
orders_df.head()
order_id account_id bank_to account_to debited_amount k_symbol
0 29401 1 YZ 87144583 2452.0 Household
1 29402 2 ST 89597016 3372.7 Loan_Payment
2 29403 2 QR 13943797 7266.0 Household
3 29404 3 WX 83084338 1135.0 Household
4 29405 3 CD 24485939 327.0 NaN
orders_df.describe()
order_id account_id debited_amount
count 6471.000000 6471.000000 6471.000000
mean 33778.197497 2962.302890 3280.635698
std 3737.681949 2518.503228 2714.475335
min 29401.000000 1.000000 1.000000
25% 31187.500000 1223.000000 1241.500000
50% 32988.000000 2433.000000 2596.000000
75% 34785.500000 3645.500000 4613.500000
max 46338.000000 11362.000000 14882.000000
orders_df.nunique()
order_id          6471
account_id        3758
bank_to             13
account_to        6446
debited_amount    4412
k_symbol             4
dtype: int64

There appear to be as many order ids as there are rows.

plot_categorical_variables(orders_df, ["k_symbol", "bank_to"])
/tmp/ipykernel_1845/945940023.py:33: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

plot_numerical_distributions(orders_df, ["debited_amount"])

1.2.5 Transactions

# column 8 is the 'bank' column which contains NaNs and must be read as string
transactions_df = read_csv("data/trans.csv", dtypes={8: str})

transactions_df["date"] = pd.to_datetime(transactions_df["date"], format="%y%m%d")

# Translated type, operations and characteristics from Czech to English
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
transactions_df = remap_values(
    transactions_df,
    "type",
    {
        "VYBER": "Withdrawal",  # Also withdrawal as it is against the documentation present in the dataset
        "PRIJEM": "Credit",
        "VYDAJ": "Withdrawal",
    },
)

transactions_df = remap_values(
    transactions_df,
    "operation",
    {
        "VYBER KARTOU": "Credit Card Withdrawal",
        "VKLAD": "Credit in Cash",
        "PREVOD Z UCTU": "Collection from Another Bank",
        "VYBER": "Withdrawal in Cash",
        "PREVOD NA UCET": "Remittance to Another Bank",
    },
)

transactions_df = remap_values(
    transactions_df,
    "k_symbol",
    {
        "POJISTNE": "Insurance Payment",
        "SLUZBY": "Payment on Statement",
        "UROK": "Interest Credited",
        "SANKC. UROK": "Sanction Interest",
        "SIPO": "Household",
        "DUCHOD": "Old-age Pension",
        "UVER": "Loan Payment",
    },
)

# Set the amount to negative for withdrawals and positive for credits
transactions_df["amount"] = np.where(
    transactions_df["type"] == "Credit",
    transactions_df["amount"],
    -transactions_df["amount"],
)

transactions_df.rename(columns={"type": "transaction_type"}, inplace=True)

transactions_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1056320 entries, 0 to 1056319
Data columns (total 10 columns):
 #   Column            Non-Null Count    Dtype         
---  ------            --------------    -----         
 0   trans_id          1056320 non-null  int64         
 1   account_id        1056320 non-null  int64         
 2   date              1056320 non-null  datetime64[ns]
 3   transaction_type  1056320 non-null  object        
 4   operation         873206 non-null   object        
 5   amount            1056320 non-null  float64       
 6   balance           1056320 non-null  float64       
 7   k_symbol          521006 non-null   object        
 8   bank              273508 non-null   object        
 9   account           295389 non-null   float64       
dtypes: datetime64[ns](1), float64(3), int64(2), object(4)
memory usage: 80.6+ MB
transactions_df.head()
trans_id account_id date transaction_type operation amount balance k_symbol bank account
0 695247 2378 1993-01-01 Credit Credit in Cash 700.0 700.0 NaN NaN NaN
1 171812 576 1993-01-01 Credit Credit in Cash 900.0 900.0 NaN NaN NaN
2 207264 704 1993-01-01 Credit Credit in Cash 1000.0 1000.0 NaN NaN NaN
3 1117247 3818 1993-01-01 Credit Credit in Cash 600.0 600.0 NaN NaN NaN
4 579373 1972 1993-01-02 Credit Credit in Cash 400.0 400.0 NaN NaN NaN
transactions_df.describe()
trans_id account_id date amount balance account
count 1.056320e+06 1.056320e+06 1056320 1.056320e+06 1.056320e+06 2.953890e+05
mean 1.335311e+06 2.936867e+03 1997-01-04 07:29:27.037261952 1.866397e+02 3.851833e+04 4.567092e+07
min 1.000000e+00 1.000000e+00 1993-01-01 00:00:00 -8.740000e+04 -4.112570e+04 0.000000e+00
25% 4.302628e+05 1.204000e+03 1996-01-16 00:00:00 -3.019000e+03 2.240250e+04 1.782858e+07
50% 8.585065e+05 2.434000e+03 1997-04-10 00:00:00 -1.460000e+01 3.314340e+04 4.575095e+07
75% 2.060979e+06 3.660000e+03 1998-02-28 00:00:00 2.000000e+02 4.960362e+04 7.201341e+07
max 3.682987e+06 1.138200e+04 1998-12-31 00:00:00 7.481200e+04 2.096370e+05 9.999420e+07
std 1.227487e+06 2.477345e+03 NaN 1.121353e+04 2.211787e+04 3.066340e+07
plot_categorical_variables(
    transactions_df, ["transaction_type", "operation", "k_symbol"]
)
/tmp/ipykernel_1845/945940023.py:33: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

plot_numerical_distributions(transactions_df, ["date", "amount", "balance"])

Looking at the distributions of the transaction table we can see that the count of transactions per year increase over time. So we can conclude that the bank has a growing client base.

However, the other plots are not very useful. For one the transaction amount seems to be very sparse, ranging from values between -80000 and 80000.

The balance distribution also showcases that there are accounts with a negative balance after a transaction, which would only make sense if debt is also included in this value.

According to description of the field balance: “balance after transaction”

1.2.5.1 Transaction Amounts and Counts by Month

# Getting a list of unique years from the dataset
transactions_df["year"] = transactions_df["date"].dt.year
transactions_df["month"] = transactions_df["date"].dt.month

months = [
    "Jan",
    "Feb",
    "Mar",
    "Apr",
    "May",
    "Jun",
    "Jul",
    "Aug",
    "Sep",
    "Oct",
    "Nov",
    "Dec",
]
years = sorted(transactions_df["year"].unique())

fig, axs = plt.subplots(
    len(years) * 2,
    1,
    figsize=(8, 6 * len(years)),
    sharex=True,
    gridspec_kw={"height_ratios": [3, 1] * len(years)},
)

for i, year in enumerate(years):
    # Filter transactions for the current year
    yearly_transactions = transactions_df[transactions_df["year"] == year]

    # Preparing data for the box plot: a list of amounts for each month for the current year
    amounts_per_month_yearly = [
        yearly_transactions[yearly_transactions["month"] == month]["amount"]
        for month in range(1, 13)
    ]

    # Preparing data for the bar chart for the current year
    monthly_summary_yearly = (
        yearly_transactions.groupby("month")
        .agg(TotalAmount=("amount", "sum"), TransactionCount=("amount", "count"))
        .reset_index()
    )

    # Box plot for transaction amounts by month for the current year
    axs[i * 2].boxplot(amounts_per_month_yearly, patch_artist=True)
    # now with seaborn
    # sns.boxplot(data=yearly_transactions, x='month', y='amount', ax=axs[i*2])
    axs[i * 2].set_title(f"Transaction Amounts Per Month in {year} (Box Plot)")
    axs[i * 2].set_yscale("symlog")
    axs[i * 2].set_ylabel("Transaction Amounts (log scale)")
    axs[i * 2].grid(True, which="both")

    # Bar chart for transaction count by month for the current year
    axs[i * 2 + 1].bar(
        monthly_summary_yearly["month"],
        monthly_summary_yearly["TransactionCount"],
        color="tab:red",
        alpha=0.6,
    )
    axs[i * 2 + 1].set_ylabel("Transaction Count")
    axs[i * 2 + 1].grid(True, which="both")

# Setting x-ticks and labels for the last bar chart (shared x-axis for all)
axs[-1].set_xticks(range(1, 13))
axs[-1].set_xticklabels(months)
axs[-1].set_xlabel("Month")

plt.tight_layout()
plt.show()

fig, axs = plt.subplots(
    2,
    len(years),
    figsize=(8 * len(years) / 2, 7),
    sharey="row",
    gridspec_kw={"height_ratios": [3, 1]},
)

for i, year in enumerate(years):
    # Filter transactions for the current year
    yearly_transactions = transactions_df[transactions_df["year"] == year]

    # Preparing data for the box plot: a list of amounts for each month for the current year
    amounts_per_month_yearly = [
        yearly_transactions[yearly_transactions["month"] == month]["amount"]
        for month in range(1, 13)
    ]

    # Preparing data for the bar chart for the current year
    monthly_summary_yearly = (
        yearly_transactions.groupby("month")
        .agg(TotalAmount=("amount", "sum"), TransactionCount=("amount", "count"))
        .reset_index()
    )

    # Selecting the appropriate axes for multiple or single year scenarios
    ax_box = axs[0, i] if len(years) > 1 else axs[0]
    ax_bar = axs[1, i] if len(years) > 1 else axs[1]

    ax_box.boxplot(amounts_per_month_yearly, patch_artist=True)
    ax_box.set_title(f"{year} (Box Plot)")
    ax_box.set_yscale("symlog")
    ax_box.set_ylabel("Transaction Amounts (log scale)")
    ax_box.grid(True, which="both")

    ax_bar.bar(
        monthly_summary_yearly["month"],
        monthly_summary_yearly["TransactionCount"],
        color="tab:red",
        alpha=0.6,
    )
    ax_bar.set_ylabel("Transaction Count")
    ax_bar.grid(True, which="both")

    # Setting common x-ticks and labels for all axes
    ax_bar.set_xticks(range(1, 13))
    ax_bar.set_xticklabels(months)

fig.text(0.5, 0.04, "Month", ha="center")
plt.tight_layout()
plt.show()

1.2.5.2 Negative Balances

negative_balances = transactions_df[transactions_df["balance"] < 0]
plot_numerical_distributions(negative_balances, ["balance", "amount"])
print(f"Number of transactions with negative balance: {len(negative_balances)}")

Number of transactions with negative balance: 2999

There appear to be 2999 transactions which have a negative balance, therefore after the transaction the account balance was negative. This implies that these accounts are in some kind of debt.

1.2.6 Loans

loans_df = read_csv("data/loan.csv")

loans_df["date"] = pd.to_datetime(loans_df["date"], format="%y%m%d")

loans_df["status"] = loans_df["status"].map(
    {
        "A": "Contract finished, no problems",
        "B": "Contract finished, loan not paid",
        "C": "Contract running, OK thus-far",
        "D": "Contract running, client in debt",
    }
)

loans_df.rename(
    columns={
        "date": "granted_date",
        "amount": "amount",
        "duration": "duration",
        "payments": "monthly_payments",
        "status": "status",
    },
    inplace=True,
)

loans_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 682 entries, 0 to 681
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   loan_id           682 non-null    int64         
 1   account_id        682 non-null    int64         
 2   granted_date      682 non-null    datetime64[ns]
 3   amount            682 non-null    int64         
 4   duration          682 non-null    int64         
 5   monthly_payments  682 non-null    float64       
 6   status            682 non-null    object        
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 37.4+ KB
# todo add some basic eda here
loans_df.head()
loan_id account_id granted_date amount duration monthly_payments status
0 5314 1787 1993-07-05 96396 12 8033.0 Contract finished, loan not paid
1 5316 1801 1993-07-11 165960 36 4610.0 Contract finished, no problems
2 6863 9188 1993-07-28 127080 60 2118.0 Contract finished, no problems
3 5325 1843 1993-08-03 105804 36 2939.0 Contract finished, no problems
4 7240 11013 1993-09-06 274740 60 4579.0 Contract finished, no problems
loans_df.describe()
loan_id account_id granted_date amount duration monthly_payments
count 682.000000 682.000000 682 682.000000 682.000000 682.000000
mean 6172.466276 5824.162757 1996-09-29 05:35:43.108504448 151410.175953 36.492669 4190.664223
min 4959.000000 2.000000 1993-07-05 00:00:00 4980.000000 12.000000 304.000000
25% 5577.500000 2967.000000 1995-07-04 12:00:00 66732.000000 24.000000 2477.000000
50% 6176.500000 5738.500000 1997-02-06 12:00:00 116928.000000 36.000000 3934.000000
75% 6752.500000 8686.000000 1997-12-12 12:00:00 210654.000000 48.000000 5813.500000
max 7308.000000 11362.000000 1998-12-08 00:00:00 590820.000000 60.000000 9910.000000
std 682.579279 3283.512681 NaN 113372.406310 17.075219 2215.830344
loans_df.nunique()
loan_id             682
account_id          682
granted_date        559
amount              645
duration              5
monthly_payments    577
status                4
dtype: int64

It seems as if one account can have at max one loan.

plot_categorical_variables(loans_df, ["duration", "status"])

The distribution of durations seems to be even.

plot_numerical_distributions(loans_df, ["granted_date"])

1.2.7 Credit Cards

cards_df = read_csv("data/card.csv")

cards_df["issued"] = pd.to_datetime(
    cards_df["issued"], format="%y%m%d %H:%M:%S"
).dt.date

cards_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 892 entries, 0 to 891
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   card_id  892 non-null    int64 
 1   disp_id  892 non-null    int64 
 2   type     892 non-null    object
 3   issued   892 non-null    object
dtypes: int64(2), object(2)
memory usage: 28.0+ KB
cards_df.head()
card_id disp_id type issued
0 1005 9285 classic 1993-11-07
1 104 588 classic 1994-01-19
2 747 4915 classic 1994-02-05
3 70 439 classic 1994-02-08
4 577 3687 classic 1994-02-15
cards_df.describe()
card_id disp_id
count 892.000000 892.000000
mean 480.855381 3511.862108
std 306.933982 2984.373626
min 1.000000 9.000000
25% 229.750000 1387.000000
50% 456.500000 2938.500000
75% 684.250000 4459.500000
max 1247.000000 13660.000000
plot_categorical_variables(cards_df, ["type"])

plot_numerical_distributions(cards_df, ["issued"])

1.2.8 Demographic data

districts_df = read_csv("data/district.csv")

# Rename columns
# according to https://sorry.vse.cz/~berka/challenge/PAST/index.html
districts_df.rename(
    columns={
        "A1": "district_id",
        "A2": "district_name",
        "A3": "region",
        "A4": "inhabitants",
        "A5": "small_municipalities",
        "A6": "medium_municipalities",
        "A7": "large_municipalities",
        "A8": "huge_municipalities",
        "A9": "cities",
        "A10": "ratio_urban_inhabitants",
        "A11": "average_salary",
        "A12": "unemployment_rate_1995",
        "A13": "unemployment_rate_1996",
        "A14": "entrepreneurs_per_1000_inhabitants",
        "A15": "crimes_committed_1995",
        "A16": "crimes_committed_1996",
    },
    inplace=True,
)

for col in [
    "unemployment_rate_1995",
    "unemployment_rate_1996",
    "crimes_committed_1995",
    "crimes_committed_1996",
]:
    districts_df[col] = pd.to_numeric(districts_df[col], errors="coerce")

districts_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   district_id                         77 non-null     int64  
 1   district_name                       77 non-null     object 
 2   region                              77 non-null     object 
 3   inhabitants                         77 non-null     int64  
 4   small_municipalities                77 non-null     int64  
 5   medium_municipalities               77 non-null     int64  
 6   large_municipalities                77 non-null     int64  
 7   huge_municipalities                 77 non-null     int64  
 8   cities                              77 non-null     int64  
 9   ratio_urban_inhabitants             77 non-null     float64
 10  average_salary                      77 non-null     int64  
 11  unemployment_rate_1995              76 non-null     float64
 12  unemployment_rate_1996              77 non-null     float64
 13  entrepreneurs_per_1000_inhabitants  77 non-null     int64  
 14  crimes_committed_1995               76 non-null     float64
 15  crimes_committed_1996               77 non-null     int64  
dtypes: float64(4), int64(10), object(2)
memory usage: 9.8+ KB

It appears as if there is 1 null value for unemployment rate in 1995 and crimes committed in 1995.

# todo add some basic eda here
districts_df.head()
district_id district_name region inhabitants small_municipalities medium_municipalities large_municipalities huge_municipalities cities ratio_urban_inhabitants average_salary unemployment_rate_1995 unemployment_rate_1996 entrepreneurs_per_1000_inhabitants crimes_committed_1995 crimes_committed_1996
0 1 Hl.m. Praha Prague 1204953 0 0 0 1 1 100.0 12541 0.29 0.43 167 85677.0 99107
1 2 Benesov central Bohemia 88884 80 26 6 2 5 46.7 8507 1.67 1.85 132 2159.0 2674
2 3 Beroun central Bohemia 75232 55 26 4 1 5 41.7 8980 1.95 2.21 111 2824.0 2813
3 4 Kladno central Bohemia 149893 63 29 6 2 6 67.4 9753 4.64 5.05 109 5244.0 5892
4 5 Kolin central Bohemia 95616 65 30 4 1 6 51.4 9307 3.85 4.43 118 2616.0 3040
districts_df.describe()
district_id inhabitants small_municipalities medium_municipalities large_municipalities huge_municipalities cities ratio_urban_inhabitants average_salary unemployment_rate_1995 unemployment_rate_1996 entrepreneurs_per_1000_inhabitants crimes_committed_1995 crimes_committed_1996
count 77.000000 7.700000e+01 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 77.000000 76.000000 77.000000 77.000000 76.000000 77.000000
mean 39.000000 1.338849e+05 48.623377 24.324675 6.272727 1.727273 6.259740 63.035065 9031.675325 3.119342 3.787013 116.129870 4850.315789 5030.831169
std 22.371857 1.369135e+05 32.741829 12.780991 4.015222 1.008338 2.435497 16.221727 790.202347 1.665568 1.908480 16.608773 9888.951933 11270.796786
min 1.000000 4.282100e+04 0.000000 0.000000 0.000000 0.000000 1.000000 33.900000 8110.000000 0.290000 0.430000 81.000000 818.000000 888.000000
25% 20.000000 8.585200e+04 22.000000 16.000000 4.000000 1.000000 5.000000 51.900000 8512.000000 1.787500 2.310000 105.000000 2029.750000 2122.000000
50% 39.000000 1.088710e+05 49.000000 25.000000 6.000000 2.000000 6.000000 59.800000 8814.000000 2.825000 3.600000 113.000000 2932.000000 3040.000000
75% 58.000000 1.390120e+05 71.000000 32.000000 8.000000 2.000000 8.000000 73.500000 9317.000000 3.890000 4.790000 126.000000 4525.500000 4595.000000
max 77.000000 1.204953e+06 151.000000 70.000000 20.000000 5.000000 11.000000 100.000000 12541.000000 7.340000 9.400000 167.000000 85677.000000 99107.000000
districts_df.nunique()
district_id                           77
district_name                         77
region                                 8
inhabitants                           77
small_municipalities                  53
medium_municipalities                 36
large_municipalities                  17
huge_municipalities                    6
cities                                11
ratio_urban_inhabitants               70
average_salary                        76
unemployment_rate_1995                70
unemployment_rate_1996                73
entrepreneurs_per_1000_inhabitants    44
crimes_committed_1995                 75
crimes_committed_1996                 76
dtype: int64
plot_numerical_distributions(districts_df, ["crimes_committed_1995"])

plot_categorical_variables(districts_df, ["region"])
/tmp/ipykernel_1845/945940023.py:33: UserWarning:

set_ticklabels() should only be used with a fixed number of ticks, i.e. after set_ticks() or using a FixedLocator.

We need to differentiate between the domicile of the client and account, as they can be different.

1.3 Data Relationships

Following the documentation of the dataset, there are multiple relationships that need to be validated. https://sorry.vse.cz/~berka/challenge/PAST/index.html

The ERD according to the descriptions on https://sorry.vse.cz/~berka/challenge/PAST/index.html

This ERD shows how the data appears in the dataset:

In order to also validate the relationships from a algorithmic perspective, we can use the following code:

# Verify 1:1 relationships between CLIENT, LOAN and DISPOSITION
assert dispositions_df[
    "client_id"
].is_unique, "Each client_id should appear exactly once in the DISPOSITION DataFrame."
assert loans_df[
    "account_id"
].is_unique, "Each account_id should appear exactly once in the LOAN DataFrame."

# Verify 1:M relationships between ACCOUNT and DISPOSITION
# assert dispositions['account_id'].is_unique == False, "An account_id should appear more than once in the DISPOSITION DataFrame."
assert (
    dispositions_df["account_id"].is_unique == True
), "An account_id should appear once in the DISPOSITION DataFrame."
# TODO check if in accordance to decision to remove disponents from dispositions

# Verify each district_id in ACCOUNT and CLIENT exists in DISTRICT
assert set(accounts_df["district_id"]).issubset(
    set(districts_df["district_id"])
), "All district_ids in ACCOUNT should exist in DISTRICT."
assert set(clients_df["district_id"]).issubset(
    set(districts_df["district_id"])
), "All district_ids in CLIENT should exist in DISTRICT."

# Verify each account_id in DISPOSITION, ORDER, TRANSACTION, and LOAN exists in ACCOUNT
assert set(dispositions_df["account_id"]).issubset(
    set(accounts_df["account_id"])
), "All account_ids in DISPOSITION should exist in ACCOUNT."
assert set(orders_df["account_id"]).issubset(
    set(accounts_df["account_id"])
), "All account_ids in ORDER should exist in ACCOUNT."
assert set(transactions_df["account_id"]).issubset(
    set(accounts_df["account_id"])
), "All account_ids in TRANSACTION should exist in ACCOUNT."
assert set(loans_df["account_id"]).issubset(
    set(accounts_df["account_id"])
), "All account_ids in LOAN should exist in ACCOUNT."

# Verify each client_id in DISPOSITION exists in CLIENT
assert set(dispositions_df["client_id"]).issubset(
    set(clients_df["client_id"])
), "All client_ids in DISPOSITION should exist in CLIENT."

# Verify each disp_id in CARD exists in DISPOSITION
assert set(cards_df["disp_id"]).issubset(
    set(dispositions_df["disp_id"])
), "All disp_ids in CARD should exist in DISPOSITION."

2 Data Preparation: Non-Transactional Data

orders_pivot_df = orders_df.pivot_table(
    index="account_id",
    columns="k_symbol",
    values="debited_amount",
    aggfunc="sum",
    fill_value=0,
)

orders_pivot_df.columns = [
    f"k_symbol_debited_sum_{col.lower()}" for col in orders_pivot_df.columns
]
orders_pivot_df = orders_pivot_df.reset_index()  # Use created index as account_id
orders_pivot_df.head()
account_id k_symbol_debited_sum_household k_symbol_debited_sum_insurance_payment k_symbol_debited_sum_leasing k_symbol_debited_sum_loan_payment k_symbol_debited_sum_na
0 1 2452.0 0.0 0.0 0.0 0.0
1 2 7266.0 0.0 0.0 3372.7 0.0
2 3 1135.0 3539.0 0.0 0.0 327.0
3 4 3363.0 0.0 0.0 0.0 0.0
4 5 2668.0 0.0 0.0 0.0 0.0
def merge_non_transactional_data(
    clients, districts, dispositions, accounts, orders, loans, cards
):
    # Rename district_id for clarity in clients and accounts DataFrames
    clients = clients.rename(columns={"district_id": "client_district_id"})
    accounts = accounts.rename(columns={"district_id": "account_district_id"})

    # Prepare districts dataframe for merge with prefix for clients and accounts
    districts_client_prefixed = districts.add_prefix("client_")
    districts_account_prefixed = districts.add_prefix("account_")

    # Merge district information for clients and accounts with prefixed columns
    clients_with_districts = pd.merge(
        clients,
        districts_client_prefixed,
        left_on="client_district_id",
        right_on="client_district_id",
        how="left",
    )
    accounts_with_districts = pd.merge(
        accounts,
        districts_account_prefixed,
        left_on="account_district_id",
        right_on="account_district_id",
        how="left",
    )

    # Merge cards with dispositions and prefix card-related columns to avoid confusion
    cards_prefixed = cards.add_prefix("card_")
    dispositions_with_cards = pd.merge(
        dispositions,
        cards_prefixed,
        left_on="disp_id",
        right_on="card_disp_id",
        how="left",
    )

    # Merge clients (with district info) with dispositions and cards
    # Assuming dispositions might have columns that overlap with clients, prefix those if necessary
    clients_dispositions_cards = pd.merge(
        dispositions_with_cards, clients_with_districts, on="client_id", how="left"
    )

    # Merge the above with accounts (with district info) on account_id
    accounts_clients_cards = pd.merge(
        accounts_with_districts, clients_dispositions_cards, on="account_id", how="left"
    )

    # Merge orders DataFrame, assuming orders might contain columns that could overlap, prefix as needed
    orders_prefixed = orders.add_prefix("order_")
    comprehensive_df_with_orders = pd.merge(
        accounts_clients_cards,
        orders_prefixed,
        left_on="account_id",
        right_on="order_account_id",
        how="left",
    )

    # Merge loans with the comprehensive dataframe (now including orders) on account_id
    # Prefix loan-related columns to maintain clarity
    loans_prefixed = loans.add_prefix("loan_")
    final_df = pd.merge(
        comprehensive_df_with_orders,
        loans_prefixed,
        left_on="account_id",
        right_on="loan_account_id",
        how="left",
    )

    final_df["account_created"] = pd.to_datetime(final_df["account_created"])
    final_df["card_issued"] = pd.to_datetime(final_df["card_issued"])
    final_df["has_card"] = final_df["card_issued"].notna()
    return final_df


non_transactional_df = merge_non_transactional_data(
    clients_df,
    districts_df,
    dispositions_df,
    accounts_df,
    orders_pivot_df,
    loans_df,
    cards_df,
)
non_transactional_df.to_csv("data/non_transactional.csv", index=False)
non_transactional_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4500 entries, 0 to 4499
Data columns (total 59 columns):
 #   Column                                        Non-Null Count  Dtype         
---  ------                                        --------------  -----         
 0   account_id                                    4500 non-null   int64         
 1   account_district_id                           4500 non-null   int64         
 2   account_frequency                             4500 non-null   object        
 3   account_created                               4500 non-null   datetime64[ns]
 4   account_district_name                         4500 non-null   object        
 5   account_region                                4500 non-null   object        
 6   account_inhabitants                           4500 non-null   int64         
 7   account_small_municipalities                  4500 non-null   int64         
 8   account_medium_municipalities                 4500 non-null   int64         
 9   account_large_municipalities                  4500 non-null   int64         
 10  account_huge_municipalities                   4500 non-null   int64         
 11  account_cities                                4500 non-null   int64         
 12  account_ratio_urban_inhabitants               4500 non-null   float64       
 13  account_average_salary                        4500 non-null   int64         
 14  account_unemployment_rate_1995                4452 non-null   float64       
 15  account_unemployment_rate_1996                4500 non-null   float64       
 16  account_entrepreneurs_per_1000_inhabitants    4500 non-null   int64         
 17  account_crimes_committed_1995                 4452 non-null   float64       
 18  account_crimes_committed_1996                 4500 non-null   int64         
 19  disp_id                                       4500 non-null   int64         
 20  client_id                                     4500 non-null   int64         
 21  type                                          4500 non-null   object        
 22  card_card_id                                  892 non-null    float64       
 23  card_disp_id                                  892 non-null    float64       
 24  card_type                                     892 non-null    object        
 25  card_issued                                   892 non-null    datetime64[ns]
 26  client_district_id                            4500 non-null   int64         
 27  sex                                           4500 non-null   object        
 28  birth_date                                    4500 non-null   datetime64[ns]
 29  age                                           4500 non-null   int64         
 30  client_district_name                          4500 non-null   object        
 31  client_region                                 4500 non-null   object        
 32  client_inhabitants                            4500 non-null   int64         
 33  client_small_municipalities                   4500 non-null   int64         
 34  client_medium_municipalities                  4500 non-null   int64         
 35  client_large_municipalities                   4500 non-null   int64         
 36  client_huge_municipalities                    4500 non-null   int64         
 37  client_cities                                 4500 non-null   int64         
 38  client_ratio_urban_inhabitants                4500 non-null   float64       
 39  client_average_salary                         4500 non-null   int64         
 40  client_unemployment_rate_1995                 4448 non-null   float64       
 41  client_unemployment_rate_1996                 4500 non-null   float64       
 42  client_entrepreneurs_per_1000_inhabitants     4500 non-null   int64         
 43  client_crimes_committed_1995                  4448 non-null   float64       
 44  client_crimes_committed_1996                  4500 non-null   int64         
 45  order_account_id                              3758 non-null   float64       
 46  order_k_symbol_debited_sum_household          3758 non-null   float64       
 47  order_k_symbol_debited_sum_insurance_payment  3758 non-null   float64       
 48  order_k_symbol_debited_sum_leasing            3758 non-null   float64       
 49  order_k_symbol_debited_sum_loan_payment       3758 non-null   float64       
 50  order_k_symbol_debited_sum_na                 3758 non-null   float64       
 51  loan_loan_id                                  682 non-null    float64       
 52  loan_account_id                               682 non-null    float64       
 53  loan_granted_date                             682 non-null    datetime64[ns]
 54  loan_amount                                   682 non-null    float64       
 55  loan_duration                                 682 non-null    float64       
 56  loan_monthly_payments                         682 non-null    float64       
 57  loan_status                                   682 non-null    object        
 58  has_card                                      4500 non-null   bool          
dtypes: bool(1), datetime64[ns](4), float64(21), int64(24), object(9)
memory usage: 2.0+ MB

3 Exploratory Data Analysis

3.1 Non-transactional Data

3.1.1 Card Holders

plt.figure()
plt.title("Number of Clients by Card Type")
sns.barplot(
    x=["No Card", "Classic/Gold Card Holders", "Junior Card Holders"],
    y=[
        non_transactional_df["card_type"].isna().sum(),
        non_transactional_df["card_type"].isin(["gold", "classic"]).sum(),
        non_transactional_df["card_type"].eq("junior").sum(),
    ],
)
# ensure that the number of clients is shown on the bars
for i, v in enumerate(
    [
        non_transactional_df["card_type"].isna().sum(),
        non_transactional_df["card_type"].isin(["gold", "classic"]).sum(),
        non_transactional_df["card_type"].eq("junior").sum(),
    ]
):
    plt.text(i, v + 10, str(v), ha="center", va="bottom")

plt.show()

Looking at the distribution of card holders in general we can see that the most clients are not in a possession of a credit card.

plt.figure()
plt.title(
    f'Distribution of Age for Junior Card Holders\n total count = {len(non_transactional_df[non_transactional_df["card_type"] == "junior"])}'
)
sns.histplot(
    non_transactional_df[non_transactional_df["card_type"] == "junior"]["age"],
    kde=True,
    bins=30,
)
plt.xlabel("Age of Client (presumably in 1999)")
plt.show()

Looking at the age distribution of Junior Card holders paints a picture on this group, however only looking at the current age may be misleading as we need to understand how old they were when the card was issued to determine if they could have been eligble for a Classic/Gold card (at least 18 when the card was issued).

non_transactional_df["card_issued"] = pd.to_datetime(
    non_transactional_df["card_issued"]
)

non_transactional_df["age_at_card_issuance"] = (
    non_transactional_df["card_issued"] - non_transactional_df["birth_date"]
)
non_transactional_df["age_at_card_issuance"] = (
    non_transactional_df["age_at_card_issuance"].dt.days // 365
)

plt.figure()
plt.title(
    f'Distribution of Age at Card Issuance for Junior Card Holders\n total count = {len(non_transactional_df[non_transactional_df["card_type"] == "junior"])}'
)
sns.histplot(
    non_transactional_df[non_transactional_df["card_type"] == "junior"][
        "age_at_card_issuance"
    ],
    kde=True,
    bins=30,
)
plt.xlabel("Age at Card Issuance")
plt.show()

Here we can see that roughly 1/3 of the Junior Card holders were not of legal age (assuming legal age is 18) when receiving their Junior Card.

plt.figure()
plt.title(
    f"Distribution of Age at Card Issuance for All Card Types\n total count = {len(non_transactional_df)}"
)
sns.histplot(
    non_transactional_df[non_transactional_df["card_type"] == "junior"][
        "age_at_card_issuance"
    ],
    kde=True,
    bins=10,
    color="blue",
    label="Junior Card Holders",
)
sns.histplot(
    non_transactional_df[non_transactional_df["card_type"] != "junior"][
        "age_at_card_issuance"
    ],
    kde=True,
    bins=30,
    color="red",
    label="Non-Junior Card Holders",
)
plt.legend()
plt.xlabel("Age at Card Issuance")
plt.show()

Comparing the age at issue date between Junior and non-Junior (Classic/Gold) card holders shows that there is no overlap between the two groups, which makes intutively sense.

Therefore removing the subset of Junior Cards seems as valid as there is no reason to believe that there are Junior Cards issued wrongly, the subset being relatively small compared to the remaining issued cards and the fact that our target is specifically Classic/Gold Card owners.

before_len = len(non_transactional_df)
non_transactional_df = non_transactional_df[
    non_transactional_df["card_type"] != "junior"
]
data_reduction["Junior Card Holders"] = -(before_len - len(non_transactional_df))
del before_len

Looking at the age distribution of Junior card holders and their occurence in comparison it seems valid to remove them as they are not the target group and make up a small subset of the complete dataset.

3.1.2 Time factors on Card Status

The time between creating an account and issuing a card may also be important when filtering customers based on their history. We should avoid filtering out potentially interesting periods and understand how the timespans between account creation and card issuance are distributed.

non_transactional_w_cards_df = non_transactional_df[
    non_transactional_df["card_issued"].notna()
    & non_transactional_df["account_created"].notna()
]
non_transactional_w_cards_df["duration_days"] = (
    non_transactional_w_cards_df["card_issued"]
    - non_transactional_w_cards_df["account_created"]
).dt.days

plt.figure(figsize=(8, 6))
sns.histplot(
    non_transactional_w_cards_df["duration_days"], bins=50, edgecolor="black", kde=True
)
plt.title("Distribution of Duration Between Account Creation and Card Issuance")
plt.xlabel("Duration in Days")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
/tmp/ipykernel_1845/17211290.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

The histogram displays a distribution with multiple peaks, indicating that there are several typical time frames for card issuance after account creation. The highest peak occurs within the first 250 days, suggesting that a significant number of cards are issued during this period. The frequency decreases as duration increases, with noticeable peaks that may correspond to specific processing batch cycles or policy changes over time. The distribution also has a long tail, suggesting that in some cases, card issuance can take a very long time.

Analyzing the length of time a client has been with the bank in relation to their account creation date and card ownership can provide valuable insights for a bank’s customer relationship management and product targeting strategies. Long-standing clients may exhibit different banking behaviors, such as product adoption and loyalty patterns, compared to newer clients.

max_account_creation_date = non_transactional_df["card_issued"].max()

non_transactional_df["client_tenure_years_relative"] = (
    max_account_creation_date - non_transactional_df["account_created"]
).dt.days / 365.25

plt.figure()
ax = sns.histplot(
    data=non_transactional_df,
    x="client_tenure_years_relative",
    hue="has_card",
    multiple="stack",
    binwidth=1,
    stat="percent",
)

# Call the function to add labels
add_percentage_labels(ax, non_transactional_df["has_card"].unique())

# Additional plot formatting
plt.title("Client Tenure Relative to Latest Card Issued Date and Card Ownership")
plt.xlabel("Client Tenure (Years, Relative to Latest Card Issuance)")
plt.ylabel("Percentage of Clients")

# Display the plot
plt.show()

The bar chart shows the tenure of clients in years, categorized by whether they own a credit card (True) or not (False). Each bar represents the percentage of clients within a specific tenure range, allowing for comparison of the distribution of card ownership among clients with different lengths of association with the bank.

3.1.3 Demographics

Using the available demographic data, we can investigate the potential correlation between demographic data and card status. The average salary may indicate a difference between cardholders and non-cardholders, as it is reasonable to assume that cardholders have a higher average salary than non-cardholders.

plt.figure()
sns.boxplot(x="has_card", y="client_average_salary", data=non_transactional_df)
plt.title("Average Salary in Client's Region by Card Ownership")
plt.xlabel("Has Card")
plt.ylabel("Average Salary")
plt.xticks([0, 1], ["No Card Owner", "Card Owner"])

plt.show()

The box plot compares the average salaries of clients who own a credit card with those who do not. Both groups have a substantial overlap in salary ranges, suggesting that while there might be a trend for card owners to have higher salaries, the difference is not significant. The median salary for card owners is slightly higher than that for non-card owners, as indicated by the median line within the respective boxes.

Both distributions have outliers on the higher end, indicating that some individuals have salaries significantly above the average in both groups. However, these outliers do not dominate the general trend.

It should also be noted that this plot assumes that the average salary of the region’s clients remained constant over the years, which is unlikely to be true.

The group of bar charts represents the distribution of credit card ownership across various demographics, showing the percentage of clients with and without cards within different age groups, sexes, and regions.

non_transactional_df["age_group"] = pd.cut(
    non_transactional_df["age"],
    bins=[0, 25, 40, 55, 70, 100],
    labels=["<25", "25-40", "40-55", "55-70", ">70"],
)

plt.figure(figsize=(8, 12))

# Age Group
plt.subplot(3, 1, 1)
age_group_counts = (
    non_transactional_df.groupby(["age_group", "has_card"]).size().unstack(fill_value=0)
)
age_group_percentages = (age_group_counts.T / age_group_counts.sum(axis=1)).T * 100
age_group_plot = age_group_percentages.plot(kind="bar", stacked=True, ax=plt.gca())
age_group_plot.set_title("Card Ownership by Age Group")
age_group_plot.set_ylabel("Percentage")
add_percentage_labels(age_group_plot, non_transactional_df["has_card"].unique())

# Sex
plt.subplot(3, 1, 2)
sex_counts = (
    non_transactional_df.groupby(["sex", "has_card"]).size().unstack(fill_value=0)
)
sex_percentages = (sex_counts.T / sex_counts.sum(axis=1)).T * 100
sex_plot = sex_percentages.plot(kind="bar", stacked=True, ax=plt.gca())
sex_plot.set_title("Card Ownership by Sex")
sex_plot.set_ylabel("Percentage")
add_percentage_labels(sex_plot, non_transactional_df["has_card"].unique())

# Client Region
plt.subplot(3, 1, 3)
region_counts = (
    non_transactional_df.groupby(["client_region", "has_card"])
    .size()
    .unstack(fill_value=0)
)
region_percentages = (region_counts.T / region_counts.sum(axis=1)).T * 100
region_plot = region_percentages.plot(kind="bar", stacked=True, ax=plt.gca())
region_plot.set_title("Card Ownership by Client Region")
region_plot.set_ylabel("Percentage")
region_plot.tick_params(axis="x", rotation=45)
add_percentage_labels(region_plot, non_transactional_df["has_card"].unique())

plt.tight_layout()
plt.show()
/tmp/ipykernel_1845/271218705.py:12: FutureWarning:

The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning.

Card Ownership by Age Group: The bar chart displays the proportion of cardholders in different age groups. The percentage of cardholders is lowest in the age group of over 70, followed by the age group of 55-70, indicating that card ownership is more prevalent among younger demographics.

Card Ownership by Sex: The bar chart shows the breakdown of card ownership by sex. The data reveals that the percentage of cardholders is comparable between both sexes, and no significant difference is present.

Card Ownership by Region The bar chart at the bottom illustrates card ownership across different regions, showing a relatively consistent pattern among most regions.

3.1.4 Impact of Loans / Debt

simplified_loan_status_mapping = {
    "Contract finished, no problems": "Finished",
    "Contract finished, loan not paid": "Not Paid",
    "Contract running, OK thus-far": "Running",
    "Contract running, client in debt": "In Debt",
    "No Loan": "No Loan",
}

non_transactional_df["loan_status_simplified"] = non_transactional_df[
    "loan_status"
].map(simplified_loan_status_mapping)

# this variable wants to kill itself
loan_status_simplified_card_ownership_counts = (
    non_transactional_df.groupby(["loan_status_simplified", "has_card"])
    .size()
    .unstack(fill_value=0)
)
loan_status_simplified_card_ownership_percentages = (
    loan_status_simplified_card_ownership_counts.T
    / loan_status_simplified_card_ownership_counts.sum(axis=1)
).T * 100

loan_status_simplified_card_ownership_percentages.plot(
    kind="bar", stacked=True, figsize=(8, 6)
)
plt.title("Interaction Between Simplified Loan Status and Card Ownership")
plt.xlabel("Simplified Loan Status")
plt.ylabel("Percentage of Clients")
plt.xticks(rotation=45)
plt.legend(title="Has Card", labels=["No Card", "Has Card"])
plt.tight_layout()
plt.show()

3.2 Transactional Data

TODO: Add more EDA for transactional data

zero_amount_transactions_df = transactions_df[transactions_df["amount"] == 0]

zero_amount_transactions_info = {
    "total_zero_amount_transactions": len(zero_amount_transactions_df),
    "unique_accounts_with_zero_amount": zero_amount_transactions_df[
        "account_id"
    ].nunique(),
    "transaction_type_distribution": zero_amount_transactions_df[
        "transaction_type"
    ].value_counts(normalize=True),
    "operation_distribution": zero_amount_transactions_df["operation"].value_counts(
        normalize=True
    ),
    "k_symbol_distribution": zero_amount_transactions_df["k_symbol"].value_counts(
        normalize=True
    ),
}

zero_amount_transactions_info, len(zero_amount_transactions_info)
({'total_zero_amount_transactions': 14,
  'unique_accounts_with_zero_amount': 12,
  'transaction_type_distribution': transaction_type
  Withdrawal    0.714286
  Credit        0.285714
  Name: proportion, dtype: float64,
  'operation_distribution': operation
  Withdrawal in Cash    0.714286
  NA                    0.285714
  Name: proportion, dtype: float64,
  'k_symbol_distribution': k_symbol
  Sanction Interest    0.714286
  Interest Credited    0.285714
  Name: proportion, dtype: float64},
 5)
accounts_with_zero_amount_transactions = accounts_df[
    accounts_df["account_id"].isin(zero_amount_transactions_df["account_id"].unique())
]
accounts_with_zero_amount_transactions
account_id district_id account_frequency account_created
178 5369 54 MONTHLY_ISSUANCE 1993-02-25
289 5483 13 MONTHLY_ISSUANCE 1993-03-28
496 5129 68 MONTHLY_ISSUANCE 1993-06-08
513 1475 1 WEEKLY_ISSUANCE 1993-06-14
799 9337 30 MONTHLY_ISSUANCE 1993-09-13
896 102 11 MONTHLY_ISSUANCE 1993-10-16
986 8957 1 MONTHLY_ISSUANCE 1993-11-13
2033 5125 1 MONTHLY_ISSUANCE 1995-09-14
2300 9051 5 WEEKLY_ISSUANCE 1996-01-17
2651 3859 53 MONTHLY_ISSUANCE 1996-04-23
3212 6083 6 WEEKLY_ISSUANCE 1996-09-19
3342 1330 68 MONTHLY_ISSUANCE 1996-10-22
# Clean up unnecessary variables
del accounts_with_zero_amount_transactions
del zero_amount_transactions_df
del zero_amount_transactions_info

Validating first transactions where the amount equals the balance is essential for the integrity of our aggregated data analysis. This specific assertion underpins the reliability of our subsequent aggregation operations by ensuring each account’s financial history starts from a verifiable point.

def validate_first_transactions(transactions):
    """
    Validates that for each account in the transactions DataFrame, there is at least
    one transaction where the amount equals the balance on the account's first transaction date.

    Parameters:
    - transactions (pd.DataFrame): DataFrame containing transaction data with columns
      'account_id', 'date', 'amount', and 'balance'.

    Raises:
    - AssertionError: If not every account has a first transaction where the amount equals the balance.
    """

    first_dates = (
        transactions.groupby("account_id")["date"].min().reset_index(name="first_date")
    )

    first_trans = pd.merge(transactions, first_dates, how="left", on=["account_id"])

    first_trans_filtered = first_trans[
        (first_trans["date"] == first_trans["first_date"])
        & (first_trans["amount"] == first_trans["balance"])
    ]

    first_trans_filtered = first_trans_filtered.drop_duplicates(subset=["account_id"])

    unique_accounts = transactions["account_id"].nunique()
    assert (
        unique_accounts == first_trans_filtered["account_id"].nunique()
    ), "Not every account has a first transaction where the amount equals the balance."

    return "Validation successful: Each account has a first transaction where the amount equals the balance."


validate_first_transactions(transactions_df)
'Validation successful: Each account has a first transaction where the amount equals the balance.'

We can confirm the truth of the assertions made. It is certain that there is a transaction with an amount equal to the balance in the transaction history of any account on the first date.

## DEPENDENCY 1 TODO REMOVE FOR MERGE 
import json
# save transactions_df to temp as parquet

transactions_df.to_parquet("temp/transactions.parquet")
accounts_df.to_parquet("temp/accounts.parquet")
non_transactional_df.to_parquet("temp/non_transactional.parquet")

# save data reduction
with open("temp/data_reduction.json", "w") as f:
    json.dump(data_reduction, f)
## DEPENDENCY #TODO REMOVE FOR MERGE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json


transactions_df = pd.read_parquet("temp/transactions.parquet")
accounts_df = pd.read_parquet("temp/accounts.parquet")
non_transactional_df = pd.read_parquet("temp/non_transactional.parquet")
# read data_reduction from temp/data_reduction.json
with open("temp/data_reduction.json", "r") as f:
    data_reduction = json.load(f)

4 Data Preparation: Transactional Data

4.1 Set artificial issue date for non-card holders

def add_months_since_account_to_card(df):
    df["months_since_account_to_card"] = df.apply(
        lambda row: (
            (
                row["card_issued"].to_period("M")
                - row["account_created"].to_period("M")
            ).n
            if pd.notnull(row["card_issued"]) and pd.notnull(row["account_created"])
            else np.nan
        ),
        axis=1,
    )
    return df


def filter_clients_without_sufficient_history(
    non_transactional_df, min_history_months=25
):
    if "months_since_account_to_card" not in non_transactional_df.columns:
        print(
            "Warning: months_since_account_to_card column not found. Calculating history length."
        )
        non_transactional_df = add_months_since_account_to_card(non_transactional_df)

    count_before = len(non_transactional_df)
    filtered_df = non_transactional_df[
        non_transactional_df["months_since_account_to_card"].isnull()
        | (non_transactional_df["months_since_account_to_card"] >= min_history_months)
    ]
    print(
        f"Filtered out {count_before - len(filtered_df)} records with less than {min_history_months} months of history. Percentage: {(count_before - len(filtered_df)) / count_before * 100:.2f}%."
    )
    return filtered_df


before_len = len(non_transactional_df)
non_transactional_w_sufficient_history_df = filter_clients_without_sufficient_history(
    non_transactional_df
)
data_reduction["Clients without sufficient history"] = -(
    before_len - len(non_transactional_w_sufficient_history_df)
)
del before_len
Warning: months_since_account_to_card column not found. Calculating history length.
Filtered out 419 records with less than 25 months of history. Percentage: 9.62%.
non_transactional_w_card_df = non_transactional_w_sufficient_history_df.dropna(
    subset=["card_issued"]
).copy()

plt.figure(figsize=(8, 6))
sns.histplot(
    non_transactional_w_card_df["months_since_account_to_card"], kde=True, bins=30
)
plt.title(
    "Distribution of Months from Account Creation to Card Issuance (for Card Holders)"
)
plt.xlabel("Months")
plt.ylabel("Count")
plt.grid(True)
plt.tight_layout()
plt.show()

4.2 Match by similar transaction activity

The following approaches were considered to match non-card holders with card holders:

  1. Looking at the distributions above extract the amount of history a buyer most likely has at the issue data of the card
  2. For each non buyer, find a buyer which was active in a similar time window (Jaccard similarity on the Year-Month sets). Instead of looking at the full activity of a buyer, we only look at the pre-purchase activity as there is reason to believe that clients may change their patterns after purchasing date and therefore add unwanted bias.

The second approach is chosen as it is provides an intuitive way to match clients based on their activity which is not only explainable but also provides a way to match clients based on their behavior. It strikes a balance of not finding a perfect match but a good enough match to focus on the discriminative features of the data.

The following image serves as an technical overview of the matching process:

The process emphasizes matching based on the timing of activity, rather than a wide array of characteristics. By identifying when both existing cardholders and non-cardholders interacted with the bank, we can infer a level of behavioral alignment that extends beyond mere transactional data. This alignment suggests a shared response to external conditions.

The resolution of the activity matrix is a binary matrix where each row represents a client and each column represents a month. A value of 1 indicates activity in a given month, while 0 indicates inactivity. Therefore we concentrate on the periods during which clients engage with the bank in the form of transactions

Assumption: This assumes that clients active during similar periods might be influenced by the same economic and societal conditions, providing a more nuanced foundation for establishing connections between current cardholders and potential new ones.

4.2.1 Construction of the Activity Matrix

The activity matrix serves as the foundation of our matching process, mapping out the engagement of clients with our services over time. It is constructed from transaction data, organizing client interactions into a structured format that highlights periods of activity.

  1. Data Aggregation: We start with transaction data, which records each client’s interactions across various months. This data includes every transaction made by both current cardholders and potential non-cardholders.

  2. Temporal Transformation: Each transaction is associated with a specific date. These dates are then transformed into monthly periods, consolidating daily transactions into a monthly view of activity. This step simplifies the data, focusing on the presence of activity within each month rather than the specific dates or frequencies of transactions.

  3. Matrix Structure: The transformed data is arranged into a matrix format. Rows represent individual clients, identified by their account IDs. Columns correspond to monthly periods, spanning the entire range of months covered by the transaction data.

  4. Activity Indication: In the matrix, a cell value is set to indicate the presence of activity for a given client in a given month. If a client made one or more transactions in a month, the corresponding cell is marked to reflect this activity. The absence of transactions for a client in a month leaves the cell unmarked.

  5. Binary Representation: The final step involves converting the activity indicators into a binary format. Active months are represented by a ‘1’, indicating the presence of transactions, while inactive months are denoted by a ‘0’, indicating no transactions.

The heatmap provided offers a visual representation of the activity matrix for clients, depicting the levels of engagement over various periods.

  • Diagonal Trend: There is a distinct diagonal pattern, indicating that newer accounts (those created more recently) have fewer periods of activity. This makes sense as these accounts have not had the opportunity to transact over the earlier periods displayed on the heatmap.

  • Darker Areas (Purple): These represent periods of inactivity where clients did not engage. The darker the shade, the less activity occurred in that particular period for the corresponding set of accounts.

  • Brighter Areas (Yellow): In contrast, the brighter areas denote periods of activity. A brighter shade implies more clients were active during that period.

  • Account Creation Date: Clients are sorted by their account creation date. Those who joined earlier are at the top, while more recent clients appear toward the bottom of the heatmap.

def prepare_activity_matrix(transactions):
    """
    Create an activity matrix from transaction data.

    The function transforms transaction data into a binary matrix that indicates
    whether an account was active in a given month.

    Parameters:
    - transactions (pd.DataFrame): A DataFrame containing the transaction data.

    Returns:
    - pd.DataFrame: An activity matrix with accounts as rows and months as columns.
    """
    transactions["month_year"] = transactions["date"].dt.to_period("M")
    transactions["active"] = 1

    activity_matrix = transactions.pivot_table(
        index="account_id", columns="month_year", values="active", fill_value=0
    )

    activity_matrix.columns = [f"active_{str(col)}" for col in activity_matrix.columns]
    return activity_matrix


def plot_activity_matrix(activity_matrix):
    sparse_matrix = activity_matrix.astype(bool)
    plt.figure(figsize=(8, 8))
    sns.heatmap(sparse_matrix, cmap="viridis", cbar=True, yticklabels=False)
    plt.title(f"Activity Matrix across all clients sorted by account creation date")
    plt.xlabel("Period")
    plt.ylabel("Accounts")
    plt.tight_layout()
    plt.show()


activity_matrix = prepare_activity_matrix(transactions_df)
plot_activity_matrix(activity_matrix)

4.2.2 Eligibility Criteria

After constructing the activity matrix, we check for eligibility of non-cardholders to be matched with cardholders. This ensures alignment for later model construction. The eligibility criteria are as follows:

  1. Account History: Non-cardholders must have an established history of interaction, with at least 25 months of history between account creation and card issuance (12 months (= New customer period) + 13 months (= one year of history) + 1 month (Lag period)).
  2. Account Creation Date: The account creation date of a non-cardholder must precede the card issuance date of the cardholder as this is a prerequisite for the matching process to work correctly when we set the issue date for non-card holders.
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

ELIGIBILITY_THRESHOLD_HIST_MONTHS = 25


def check_eligibility_for_matching(non_cardholder, cardholder, verbose=False):
    """
    Determine if a non-cardholder is eligible for matching with a cardholder.

    This function checks whether the card issuance to a cardholder occurred at least
    25 months after the non-cardholder's account was created.

    Parameters:
    - non_cardholder (pd.Series): A data series containing the non-cardholder's details.
    - cardholder (pd.Series): A data series containing the cardholder's details.
    - verbose (bool): If True, print detailed eligibility information. Default is False.

    Returns:
    - bool: True if the non-cardholder is eligible for matching, False otherwise.
    """
    if cardholder["card_issued"] <= non_cardholder["account_created"]:
        return False

    period_diff = (
        cardholder["card_issued"].to_period("M")
        - non_cardholder["account_created"].to_period("M")
    ).n

    if verbose:
        print(
            f"Card issued: {cardholder['card_issued']}, Account created: {non_cardholder['account_created']}, Period diff: {period_diff}, Eligible: {period_diff >= ELIGIBILITY_THRESHOLD_HIST_MONTHS}"
        )

    return period_diff >= ELIGIBILITY_THRESHOLD_HIST_MONTHS

4.2.3 Matching Process

Next up we will implement the matching process. Our matching utilizes the Jaccard similarity index to compare activity patterns: We compare a vector representing an existing cardholder’s monthly activity against a matrix of non-cardholders’ activity patterns. Here we only consider the activity from the first transaction period across all customers to the card issue date.

The Jaccard similarity index is calculated as the intersection of active months divided by the union of active months between the two clients. This index ranges from 0 to 1, with higher values indicating greater similarity in activity patterns.

\[J(A, B) = \frac{|A \cap B|}{|A \cup B|}\]

The function match_cardholders_with_non_cardholders will perform the following steps:

  1. Data Preparation: The function prepares the activity matrix and splits the non-cardholders into two groups: those with and without cards.
  2. Matching Process: For each cardholder, the function calculates the Jaccard similarity between their activity pattern and those of eligible non-cardholders. It then selects the top N similar non-cardholders and randomly assigns one match per cardholder.
  3. Match Selection: The function selects a non-cardholder match for each cardholder based on the Jaccard similarity scores. It ensures that each non-cardholder is matched only once and that the top N similar non-cardholders are considered for matching.
    1. The selection among the top N similar non-cardholders is done randomly to avoid bias. This process is defined in the select_non_cardholders function.
    2. The function also checks for the eligibility as defined above.
    3. If no eligible non-cardholders are found, the function prints a warning message.
  4. Output: The function returns a list of tuples containing the matched cardholder and non-cardholder client IDs along with their similarity scores.
def select_non_cardholders(
    distances,
    eligible_non_cardholders,
    matches,
    matched_applicants,
    cardholder,
    without_card_activity,
    top_n,
):
    """
    Randomly select a non-cardholder match for a cardholder from the top N eligible candidates.

    Parameters:
    - distances (np.array): An array of Jaccard distances between a cardholder and non-cardholders.
    - eligible_non_cardholders (list): A list of indices for non-cardholders who are eligible for matching.
    - matches (list): A list to which the match will be appended.
    - matched_applicants (set): A set of indices for non-cardholders who have already been matched.
    - cardholder (pd.Series): The data series of the current cardholder.
    - without_card_activity (pd.DataFrame): A DataFrame of non-cardholders without card issuance.
    - top_n (int): The number of top similar non-cardholders to consider for matching.

    Returns:
    - None: The matches list is updated in place with the selected match.
    """
    eligible_distances = distances[eligible_non_cardholders]
    sorted_indices = np.argsort(eligible_distances)[:top_n]

    if sorted_indices.size > 0:
        selected_index = np.random.choice(sorted_indices)
        actual_selected_index = eligible_non_cardholders[selected_index]

        if actual_selected_index not in matched_applicants:
            matched_applicants.add(actual_selected_index)
            applicant = without_card_activity.iloc[actual_selected_index]
            similarity = 1 - eligible_distances[selected_index]

            matches.append(
                (cardholder["client_id"], applicant["client_id"], similarity)
            )


def match_cardholders_with_non_cardholders(non_transactional, transactions, top_n=5):
    """
    Match cardholders with non-cardholders based on the similarity of their activity patterns.

    The function creates an activity matrix, identifies eligible non-cardholders, calculates
    the Jaccard similarity to find matches, and randomly selects one match per cardholder
    from the top N similar non-cardholders.

    Parameters:
    - non_transactional (pd.DataFrame): A DataFrame containing non-cardholders.
    - transactions (pd.DataFrame): A DataFrame containing transactional data.
    - top_n (int): The number of top similar non-cardholders to consider for matching.

    Returns:
    - list: A list of tuples with the cardholder and matched non-cardholder client IDs and similarity scores.
    """
    with_card = non_transactional[non_transactional["card_issued"].notna()]
    without_card = non_transactional[non_transactional["card_issued"].isna()]

    activity_matrix = prepare_activity_matrix(transactions)

    with_card_activity = with_card.join(activity_matrix, on="account_id", how="left")
    without_card_activity = without_card.join(
        activity_matrix, on="account_id", how="left"
    )

    matched_non_cardholders = set()
    matches = []

    for idx, cardholder in tqdm(
        with_card_activity.iterrows(),
        total=len(with_card_activity),
        desc="Matching cardholders",
    ):
        issue_period = cardholder["card_issued"].to_period("M")
        eligible_cols = [
            col
            for col in activity_matrix
            if col.startswith("active") and pd.Period(col.split("_")[1]) <= issue_period
        ]

        if not eligible_cols:
            print(
                f"No eligible months found for cardholder client_id {cardholder['client_id']}."
            )
            continue
        
        cardholder_vector = cardholder[eligible_cols].values.reshape(1, -1)
        non_cardholder_matrix = without_card_activity[eligible_cols].values
        
        cardholder_vector = np.where(cardholder_vector > 0, 1, 0).astype(bool)
        non_cardholder_matrix = np.where(non_cardholder_matrix > 0, 1, 0).astype(bool)

        assert (
            cardholder_vector.shape[1] == non_cardholder_matrix.shape[1]
        ), "Dimension mismatch between cardholder and applicant activity matrix."

        distances = pairwise_distances(
            cardholder_vector, non_cardholder_matrix, 
            metric="jaccard", n_jobs=-1 
        ).flatten()
        eligible_non_cardholders = [
            i
            for i, applicant in without_card_activity.iterrows()
            if check_eligibility_for_matching(applicant, cardholder)
            and i not in matched_non_cardholders
        ]

        if eligible_non_cardholders:
            select_non_cardholders(
                distances,
                eligible_non_cardholders,
                matches,
                matched_non_cardholders,
                cardholder,
                without_card_activity,
                top_n,
            )
        else:
            print(
                f"No eligible non-cardholders found for cardholder client_id {cardholder['client_id']}."
            )

    return matches

TODO: Visualise the matching process

The matching process is executed, and the results are stored in the matched_non_card_holders_df DataFrame. The percentage of clients with a card issued before and after matching is calculated to assess the impact of the matching process. We expect the percentage of clients with a card issued to increase by 100% after matching, as each non-cardholder should be matched with a cardholder.

Last but not least we set the artificial card issue date for each non-cardholder based on the matching results.

def set_artificial_issue_dates(non_transactional_df, matches):
    """
    Augment the non-transactional DataFrame with artificial card issue dates based on matching results.

    Each matched non-cardholder is assigned a card issue date corresponding to their matched
    cardholder. The 'has_card' flag for each non-cardholder is updated accordingly.

    Parameters:
    - non_transactional_df (pd.DataFrame): The DataFrame of non-cardholders to augment.
    - matches (list): A list of tuples containing the matched cardholder and non-cardholder IDs and similarity scores.

    Returns:
    - pd.DataFrame: The augmented DataFrame with artificial card issue dates.
    """
    augmented_df = non_transactional_df.copy()
    augmented_df["has_card"] = True

    for cardholder_id, non_cardholder_id, _ in matches:
        card_issue_date = augmented_df.loc[
            augmented_df["client_id"] == cardholder_id, "card_issued"
        ].values[0]
        augmented_df.loc[
            augmented_df["client_id"] == non_cardholder_id, ["card_issued", "has_card"]
        ] = [card_issue_date, False]

    return augmented_df


matched_non_card_holders_df = match_cardholders_with_non_cardholders(
    non_transactional_w_sufficient_history_df, transactions_df
)

print(
    f"Percentage of clients with card issued: {non_transactional_w_sufficient_history_df['card_issued'].notna().mean() * 100:.2f}%"
)
matched_non_card_holders_w_issue_date_df = set_artificial_issue_dates(
    non_transactional_w_sufficient_history_df, matched_non_card_holders_df
)
print(
    f"Percentage of clients with card issued after matching: {matched_non_card_holders_w_issue_date_df['card_issued'].notna().mean() * 100:.2f}%"
)
Matching cardholders:   0%|          | 0/328 [00:00<?, ?it/s]Matching cardholders:   0%|          | 1/328 [00:00<01:54,  2.86it/s]Matching cardholders:   1%|          | 2/328 [00:00<01:52,  2.90it/s]Matching cardholders:   1%|          | 3/328 [00:01<01:51,  2.91it/s]Matching cardholders:   1%|          | 4/328 [00:01<01:44,  3.11it/s]Matching cardholders:   2%|▏         | 5/328 [00:01<01:39,  3.24it/s]Matching cardholders:   2%|▏         | 6/328 [00:01<01:43,  3.12it/s]Matching cardholders:   2%|▏         | 7/328 [00:02<01:43,  3.10it/s]Matching cardholders:   2%|▏         | 8/328 [00:02<01:44,  3.07it/s]Matching cardholders:   3%|▎         | 9/328 [00:02<01:33,  3.41it/s]Matching cardholders:   3%|▎         | 10/328 [00:03<01:25,  3.71it/s]Matching cardholders:   3%|▎         | 11/328 [00:03<01:31,  3.46it/s]Matching cardholders:   4%|▎         | 12/328 [00:03<01:35,  3.29it/s]Matching cardholders:   4%|▍         | 13/328 [00:04<01:55,  2.73it/s]Matching cardholders:   4%|▍         | 14/328 [00:04<01:47,  2.93it/s]Matching cardholders:   5%|▍         | 15/328 [00:04<01:46,  2.94it/s]Matching cardholders:   5%|▍         | 16/328 [00:05<01:34,  3.31it/s]Matching cardholders:   5%|▌         | 17/328 [00:05<01:25,  3.63it/s]Matching cardholders:   5%|▌         | 18/328 [00:05<01:20,  3.85it/s]Matching cardholders:   6%|▌         | 19/328 [00:05<01:26,  3.58it/s]Matching cardholders:   6%|▌         | 20/328 [00:06<01:29,  3.44it/s]Matching cardholders:   6%|▋         | 21/328 [00:06<01:33,  3.29it/s]Matching cardholders:   7%|▋         | 22/328 [00:06<01:35,  3.19it/s]Matching cardholders:   7%|▋         | 23/328 [00:07<01:37,  3.14it/s]Matching cardholders:   7%|▋         | 24/328 [00:07<01:38,  3.10it/s]Matching cardholders:   8%|▊         | 25/328 [00:07<01:38,  3.06it/s]Matching cardholders:   8%|▊         | 26/328 [00:08<01:29,  3.38it/s]Matching cardholders:   8%|▊         | 27/328 [00:08<01:23,  3.59it/s]Matching cardholders:   9%|▊         | 28/328 [00:08<01:27,  3.44it/s]Matching cardholders:   9%|▉         | 29/328 [00:08<01:27,  3.41it/s]Matching cardholders:   9%|▉         | 30/328 [00:09<01:30,  3.27it/s]Matching cardholders:   9%|▉         | 31/328 [00:09<01:29,  3.33it/s]Matching cardholders:  10%|▉         | 32/328 [00:09<01:21,  3.62it/s]Matching cardholders:  10%|█         | 33/328 [00:10<01:24,  3.50it/s]Matching cardholders:  10%|█         | 34/328 [00:10<01:33,  3.15it/s]Matching cardholders:  11%|█         | 35/328 [00:10<01:33,  3.13it/s]Matching cardholders:  11%|█         | 36/328 [00:10<01:25,  3.41it/s]Matching cardholders:  11%|█▏        | 37/328 [00:11<01:28,  3.28it/s]Matching cardholders:  12%|█▏        | 38/328 [00:11<01:30,  3.20it/s]Matching cardholders:  12%|█▏        | 39/328 [00:11<01:27,  3.32it/s]Matching cardholders:  12%|█▏        | 40/328 [00:12<01:27,  3.28it/s]Matching cardholders:  12%|█▎        | 41/328 [00:12<01:29,  3.22it/s]Matching cardholders:  13%|█▎        | 42/328 [00:12<01:29,  3.18it/s]Matching cardholders:  13%|█▎        | 43/328 [00:13<01:31,  3.12it/s]Matching cardholders:  13%|█▎        | 44/328 [00:13<01:25,  3.33it/s]Matching cardholders:  14%|█▎        | 45/328 [00:13<01:21,  3.48it/s]Matching cardholders:  14%|█▍        | 46/328 [00:14<01:21,  3.46it/s]Matching cardholders:  14%|█▍        | 47/328 [00:14<01:25,  3.30it/s]Matching cardholders:  15%|█▍        | 48/328 [00:14<01:27,  3.21it/s]Matching cardholders:  15%|█▍        | 49/328 [00:15<01:29,  3.13it/s]Matching cardholders:  15%|█▌        | 50/328 [00:15<01:28,  3.14it/s]Matching cardholders:  16%|█▌        | 51/328 [00:15<01:26,  3.21it/s]Matching cardholders:  16%|█▌        | 52/328 [00:15<01:20,  3.44it/s]Matching cardholders:  16%|█▌        | 53/328 [00:16<01:31,  3.02it/s]Matching cardholders:  16%|█▋        | 54/328 [00:16<01:22,  3.31it/s]Matching cardholders:  17%|█▋        | 55/328 [00:16<01:23,  3.26it/s]Matching cardholders:  17%|█▋        | 56/328 [00:17<01:25,  3.17it/s]Matching cardholders:  17%|█▋        | 57/328 [00:17<01:26,  3.12it/s]Matching cardholders:  18%|█▊        | 58/328 [00:17<01:23,  3.22it/s]Matching cardholders:  18%|█▊        | 59/328 [00:18<01:17,  3.48it/s]Matching cardholders:  18%|█▊        | 60/328 [00:18<01:16,  3.49it/s]Matching cardholders:  19%|█▊        | 61/328 [00:18<01:13,  3.65it/s]Matching cardholders:  19%|█▉        | 62/328 [00:18<01:17,  3.42it/s]Matching cardholders:  19%|█▉        | 63/328 [00:19<01:20,  3.28it/s]Matching cardholders:  20%|█▉        | 64/328 [00:19<01:21,  3.23it/s]Matching cardholders:  20%|█▉        | 65/328 [00:19<01:22,  3.18it/s]Matching cardholders:  20%|██        | 66/328 [00:20<01:21,  3.22it/s]Matching cardholders:  20%|██        | 67/328 [00:20<01:22,  3.15it/s]Matching cardholders:  21%|██        | 68/328 [00:20<01:23,  3.10it/s]Matching cardholders:  21%|██        | 69/328 [00:21<01:21,  3.19it/s]Matching cardholders:  21%|██▏       | 70/328 [00:21<01:19,  3.24it/s]Matching cardholders:  22%|██▏       | 71/328 [00:21<01:20,  3.18it/s]Matching cardholders:  22%|██▏       | 72/328 [00:22<01:16,  3.35it/s]Matching cardholders:  22%|██▏       | 73/328 [00:22<01:18,  3.24it/s]Matching cardholders:  23%|██▎       | 74/328 [00:22<01:25,  2.98it/s]Matching cardholders:  23%|██▎       | 75/328 [00:23<01:24,  2.99it/s]Matching cardholders:  23%|██▎       | 76/328 [00:23<01:24,  2.98it/s]Matching cardholders:  23%|██▎       | 77/328 [00:23<01:22,  3.03it/s]Matching cardholders:  24%|██▍       | 78/328 [00:24<01:21,  3.07it/s]Matching cardholders:  24%|██▍       | 79/328 [00:24<01:21,  3.04it/s]Matching cardholders:  24%|██▍       | 80/328 [00:24<01:21,  3.06it/s]Matching cardholders:  25%|██▍       | 81/328 [00:25<01:16,  3.24it/s]Matching cardholders:  25%|██▌       | 82/328 [00:25<01:15,  3.26it/s]Matching cardholders:  25%|██▌       | 83/328 [00:25<01:17,  3.18it/s]Matching cardholders:  26%|██▌       | 84/328 [00:25<01:18,  3.11it/s]Matching cardholders:  26%|██▌       | 85/328 [00:26<01:16,  3.17it/s]Matching cardholders:  26%|██▌       | 86/328 [00:26<01:12,  3.34it/s]Matching cardholders:  27%|██▋       | 87/328 [00:26<01:10,  3.44it/s]Matching cardholders:  27%|██▋       | 88/328 [00:27<01:05,  3.64it/s]Matching cardholders:  27%|██▋       | 89/328 [00:27<01:03,  3.76it/s]Matching cardholders:  27%|██▋       | 90/328 [00:27<01:07,  3.53it/s]Matching cardholders:  28%|██▊       | 91/328 [00:27<01:10,  3.34it/s]Matching cardholders:  28%|██▊       | 92/328 [00:28<01:12,  3.27it/s]Matching cardholders:  28%|██▊       | 93/328 [00:28<01:14,  3.17it/s]Matching cardholders:  29%|██▊       | 94/328 [00:28<01:15,  3.12it/s]Matching cardholders:  29%|██▉       | 95/328 [00:29<01:15,  3.07it/s]Matching cardholders:  29%|██▉       | 96/328 [00:29<01:16,  3.04it/s]Matching cardholders:  30%|██▉       | 97/328 [00:30<01:23,  2.78it/s]Matching cardholders:  30%|██▉       | 98/328 [00:30<01:21,  2.82it/s]Matching cardholders:  30%|███       | 99/328 [00:30<01:19,  2.88it/s]Matching cardholders:  30%|███       | 100/328 [00:30<01:11,  3.20it/s]Matching cardholders:  31%|███       | 101/328 [00:31<01:08,  3.29it/s]Matching cardholders:  31%|███       | 102/328 [00:31<01:10,  3.20it/s]Matching cardholders:  31%|███▏      | 103/328 [00:31<01:12,  3.12it/s]Matching cardholders:  32%|███▏      | 104/328 [00:32<01:12,  3.08it/s]Matching cardholders:  32%|███▏      | 105/328 [00:32<01:09,  3.22it/s]Matching cardholders:  32%|███▏      | 106/328 [00:32<01:10,  3.16it/s]Matching cardholders:  33%|███▎      | 107/328 [00:33<01:09,  3.19it/s]Matching cardholders:  33%|███▎      | 108/328 [00:33<01:03,  3.45it/s]Matching cardholders:  33%|███▎      | 109/328 [00:33<00:59,  3.66it/s]Matching cardholders:  34%|███▎      | 110/328 [00:33<01:03,  3.42it/s]Matching cardholders:  34%|███▍      | 111/328 [00:34<01:06,  3.26it/s]Matching cardholders:  34%|███▍      | 112/328 [00:34<01:06,  3.24it/s]Matching cardholders:  34%|███▍      | 113/328 [00:34<01:04,  3.35it/s]Matching cardholders:  35%|███▍      | 114/328 [00:35<01:06,  3.22it/s]Matching cardholders:  35%|███▌      | 115/328 [00:35<01:07,  3.16it/s]Matching cardholders:  35%|███▌      | 116/328 [00:35<01:05,  3.22it/s]Matching cardholders:  36%|███▌      | 117/328 [00:36<01:06,  3.18it/s]Matching cardholders:  36%|███▌      | 118/328 [00:36<01:01,  3.43it/s]Matching cardholders:  36%|███▋      | 119/328 [00:36<01:03,  3.28it/s]Matching cardholders:  37%|███▋      | 120/328 [00:37<01:08,  3.04it/s]Matching cardholders:  37%|███▋      | 121/328 [00:37<01:08,  3.01it/s]Matching cardholders:  37%|███▋      | 122/328 [00:37<01:03,  3.24it/s]Matching cardholders:  38%|███▊      | 123/328 [00:38<01:02,  3.27it/s]Matching cardholders:  38%|███▊      | 124/328 [00:38<01:01,  3.30it/s]Matching cardholders:  38%|███▊      | 125/328 [00:38<01:02,  3.24it/s]Matching cardholders:  38%|███▊      | 126/328 [00:38<01:04,  3.15it/s]Matching cardholders:  39%|███▊      | 127/328 [00:39<00:59,  3.37it/s]Matching cardholders:  39%|███▉      | 128/328 [00:39<00:56,  3.52it/s]Matching cardholders:  39%|███▉      | 129/328 [00:39<01:02,  3.18it/s]Matching cardholders:  40%|███▉      | 130/328 [00:40<01:02,  3.18it/s]Matching cardholders:  40%|███▉      | 131/328 [00:40<01:03,  3.12it/s]Matching cardholders:  40%|████      | 132/328 [00:40<00:58,  3.37it/s]Matching cardholders:  41%|████      | 133/328 [00:41<00:59,  3.26it/s]Matching cardholders:  41%|████      | 134/328 [00:41<01:01,  3.16it/s]Matching cardholders:  41%|████      | 135/328 [00:41<00:59,  3.24it/s]Matching cardholders:  41%|████▏     | 136/328 [00:42<01:00,  3.16it/s]Matching cardholders:  42%|████▏     | 137/328 [00:42<01:01,  3.10it/s]Matching cardholders:  42%|████▏     | 138/328 [00:42<01:01,  3.07it/s]Matching cardholders:  42%|████▏     | 139/328 [00:43<01:02,  3.04it/s]Matching cardholders:  43%|████▎     | 140/328 [00:43<01:01,  3.07it/s]Matching cardholders:  43%|████▎     | 141/328 [00:43<00:57,  3.24it/s]Matching cardholders:  43%|████▎     | 142/328 [00:43<00:56,  3.29it/s]Matching cardholders:  44%|████▎     | 143/328 [00:44<01:04,  2.87it/s]Matching cardholders:  44%|████▍     | 144/328 [00:44<01:03,  2.91it/s]Matching cardholders:  44%|████▍     | 145/328 [00:45<00:59,  3.08it/s]Matching cardholders:  45%|████▍     | 146/328 [00:45<00:58,  3.09it/s]Matching cardholders:  45%|████▍     | 147/328 [00:45<00:59,  3.05it/s]Matching cardholders:  45%|████▌     | 148/328 [00:45<00:56,  3.19it/s]Matching cardholders:  45%|████▌     | 149/328 [00:46<00:54,  3.27it/s]Matching cardholders:  46%|████▌     | 150/328 [00:46<00:56,  3.16it/s]Matching cardholders:  46%|████▌     | 151/328 [00:46<00:53,  3.33it/s]Matching cardholders:  46%|████▋     | 152/328 [00:47<00:54,  3.22it/s]Matching cardholders:  47%|████▋     | 153/328 [00:47<00:52,  3.37it/s]Matching cardholders:  47%|████▋     | 154/328 [00:47<00:52,  3.31it/s]Matching cardholders:  47%|████▋     | 155/328 [00:48<00:54,  3.19it/s]Matching cardholders:  48%|████▊     | 156/328 [00:48<00:55,  3.11it/s]Matching cardholders:  48%|████▊     | 157/328 [00:48<00:55,  3.08it/s]Matching cardholders:  48%|████▊     | 158/328 [00:49<00:54,  3.12it/s]Matching cardholders:  48%|████▊     | 159/328 [00:49<00:52,  3.21it/s]Matching cardholders:  49%|████▉     | 160/328 [00:49<00:51,  3.29it/s]Matching cardholders:  49%|████▉     | 161/328 [00:50<00:53,  3.09it/s]Matching cardholders:  49%|████▉     | 162/328 [00:50<00:52,  3.15it/s]Matching cardholders:  50%|████▉     | 163/328 [00:50<00:50,  3.29it/s]Matching cardholders:  50%|█████     | 164/328 [00:50<00:51,  3.19it/s]Matching cardholders:  50%|█████     | 165/328 [00:51<00:51,  3.14it/s]Matching cardholders:  51%|█████     | 166/328 [00:51<00:57,  2.81it/s]Matching cardholders:  51%|█████     | 167/328 [00:52<00:55,  2.91it/s]Matching cardholders:  51%|█████     | 168/328 [00:52<00:54,  2.95it/s]Matching cardholders:  52%|█████▏    | 169/328 [00:52<00:53,  2.96it/s]Matching cardholders:  52%|█████▏    | 170/328 [00:53<00:52,  2.98it/s]Matching cardholders:  52%|█████▏    | 171/328 [00:53<00:52,  3.01it/s]Matching cardholders:  52%|█████▏    | 172/328 [00:53<00:51,  3.03it/s]Matching cardholders:  53%|█████▎    | 173/328 [00:53<00:48,  3.20it/s]Matching cardholders:  53%|█████▎    | 174/328 [00:54<00:46,  3.31it/s]Matching cardholders:  53%|█████▎    | 175/328 [00:54<00:47,  3.19it/s]Matching cardholders:  54%|█████▎    | 176/328 [00:54<00:48,  3.13it/s]Matching cardholders:  54%|█████▍    | 177/328 [00:55<00:48,  3.08it/s]Matching cardholders:  54%|█████▍    | 178/328 [00:55<00:48,  3.07it/s]Matching cardholders:  55%|█████▍    | 179/328 [00:55<00:48,  3.06it/s]Matching cardholders:  55%|█████▍    | 180/328 [00:56<00:48,  3.03it/s]Matching cardholders:  55%|█████▌    | 181/328 [00:56<00:46,  3.16it/s]Matching cardholders:  55%|█████▌    | 182/328 [00:56<00:45,  3.22it/s]Matching cardholders:  56%|█████▌    | 183/328 [00:57<00:46,  3.14it/s]Matching cardholders:  56%|█████▌    | 184/328 [00:57<00:46,  3.08it/s]Matching cardholders:  56%|█████▋    | 185/328 [00:57<00:46,  3.09it/s]Matching cardholders:  57%|█████▋    | 186/328 [00:58<00:45,  3.12it/s]Matching cardholders:  57%|█████▋    | 187/328 [00:58<00:45,  3.07it/s]Matching cardholders:  57%|█████▋    | 188/328 [00:58<00:46,  3.04it/s]Matching cardholders:  58%|█████▊    | 189/328 [00:59<00:48,  2.86it/s]Matching cardholders:  58%|█████▊    | 190/328 [00:59<00:47,  2.89it/s]Matching cardholders:  58%|█████▊    | 191/328 [00:59<00:48,  2.82it/s]Matching cardholders:  59%|█████▊    | 192/328 [01:00<00:49,  2.77it/s]Matching cardholders:  59%|█████▉    | 193/328 [01:00<00:46,  2.90it/s]Matching cardholders:  59%|█████▉    | 194/328 [01:00<00:44,  3.02it/s]Matching cardholders:  59%|█████▉    | 195/328 [01:01<00:44,  3.01it/s]Matching cardholders:  60%|█████▉    | 196/328 [01:01<00:43,  3.07it/s]Matching cardholders:  60%|██████    | 197/328 [01:01<00:42,  3.12it/s]Matching cardholders:  60%|██████    | 198/328 [01:02<00:42,  3.07it/s]Matching cardholders:  61%|██████    | 199/328 [01:02<00:41,  3.13it/s]Matching cardholders:  61%|██████    | 200/328 [01:02<00:41,  3.10it/s]Matching cardholders:  61%|██████▏   | 201/328 [01:03<00:41,  3.06it/s]Matching cardholders:  62%|██████▏   | 202/328 [01:03<00:40,  3.11it/s]Matching cardholders:  62%|██████▏   | 203/328 [01:03<00:40,  3.07it/s]Matching cardholders:  62%|██████▏   | 204/328 [01:04<00:40,  3.03it/s]Matching cardholders:  62%|██████▎   | 205/328 [01:04<00:41,  2.98it/s]Matching cardholders:  63%|██████▎   | 206/328 [01:04<00:40,  2.98it/s]Matching cardholders:  63%|██████▎   | 207/328 [01:05<00:40,  2.99it/s]Matching cardholders:  63%|██████▎   | 208/328 [01:05<00:38,  3.08it/s]Matching cardholders:  64%|██████▎   | 209/328 [01:05<00:39,  3.05it/s]Matching cardholders:  64%|██████▍   | 210/328 [01:06<00:38,  3.04it/s]Matching cardholders:  64%|██████▍   | 211/328 [01:06<00:38,  3.02it/s]Matching cardholders:  65%|██████▍   | 212/328 [01:06<00:41,  2.80it/s]Matching cardholders:  65%|██████▍   | 213/328 [01:07<00:40,  2.85it/s]Matching cardholders:  65%|██████▌   | 214/328 [01:07<00:39,  2.87it/s]Matching cardholders:  66%|██████▌   | 215/328 [01:07<00:38,  2.90it/s]Matching cardholders:  66%|██████▌   | 216/328 [01:08<00:38,  2.93it/s]Matching cardholders:  66%|██████▌   | 217/328 [01:08<00:37,  2.93it/s]Matching cardholders:  66%|██████▋   | 218/328 [01:08<00:37,  2.97it/s]Matching cardholders:  67%|██████▋   | 219/328 [01:09<00:35,  3.03it/s]Matching cardholders:  67%|██████▋   | 220/328 [01:09<00:35,  3.00it/s]Matching cardholders:  67%|██████▋   | 221/328 [01:09<00:35,  2.99it/s]Matching cardholders:  68%|██████▊   | 222/328 [01:10<00:34,  3.03it/s]Matching cardholders:  68%|██████▊   | 223/328 [01:10<00:34,  3.08it/s]Matching cardholders:  68%|██████▊   | 224/328 [01:10<00:34,  3.03it/s]Matching cardholders:  69%|██████▊   | 225/328 [01:11<00:34,  3.01it/s]Matching cardholders:  69%|██████▉   | 226/328 [01:11<00:34,  2.99it/s]Matching cardholders:  69%|██████▉   | 227/328 [01:11<00:33,  2.98it/s]Matching cardholders:  70%|██████▉   | 228/328 [01:12<00:33,  2.97it/s]Matching cardholders:  70%|██████▉   | 229/328 [01:12<00:32,  3.02it/s]Matching cardholders:  70%|███████   | 230/328 [01:12<00:32,  3.00it/s]Matching cardholders:  70%|███████   | 231/328 [01:13<00:32,  2.99it/s]Matching cardholders:  71%|███████   | 232/328 [01:13<00:32,  2.99it/s]Matching cardholders:  71%|███████   | 233/328 [01:13<00:32,  2.91it/s]Matching cardholders:  71%|███████▏  | 234/328 [01:14<00:32,  2.92it/s]Matching cardholders:  72%|███████▏  | 235/328 [01:14<00:35,  2.65it/s]Matching cardholders:  72%|███████▏  | 236/328 [01:15<00:33,  2.73it/s]Matching cardholders:  72%|███████▏  | 237/328 [01:15<00:32,  2.81it/s]Matching cardholders:  73%|███████▎  | 238/328 [01:15<00:31,  2.85it/s]Matching cardholders:  73%|███████▎  | 239/328 [01:16<00:30,  2.88it/s]Matching cardholders:  73%|███████▎  | 240/328 [01:16<00:30,  2.90it/s]Matching cardholders:  73%|███████▎  | 241/328 [01:16<00:29,  2.92it/s]Matching cardholders:  74%|███████▍  | 242/328 [01:17<00:29,  2.94it/s]Matching cardholders:  74%|███████▍  | 243/328 [01:17<00:28,  2.95it/s]Matching cardholders:  74%|███████▍  | 244/328 [01:17<00:28,  2.96it/s]Matching cardholders:  75%|███████▍  | 245/328 [01:18<00:28,  2.96it/s]Matching cardholders:  75%|███████▌  | 246/328 [01:18<00:27,  2.97it/s]Matching cardholders:  75%|███████▌  | 247/328 [01:18<00:27,  2.96it/s]Matching cardholders:  76%|███████▌  | 248/328 [01:19<00:27,  2.96it/s]Matching cardholders:  76%|███████▌  | 249/328 [01:19<00:26,  2.98it/s]Matching cardholders:  76%|███████▌  | 250/328 [01:19<00:26,  2.97it/s]Matching cardholders:  77%|███████▋  | 251/328 [01:20<00:25,  2.97it/s]Matching cardholders:  77%|███████▋  | 252/328 [01:20<00:25,  2.97it/s]Matching cardholders:  77%|███████▋  | 253/328 [01:20<00:25,  2.96it/s]Matching cardholders:  77%|███████▋  | 254/328 [01:21<00:24,  2.96it/s]Matching cardholders:  78%|███████▊  | 255/328 [01:21<00:24,  2.96it/s]Matching cardholders:  78%|███████▊  | 256/328 [01:21<00:24,  2.95it/s]Matching cardholders:  78%|███████▊  | 257/328 [01:22<00:24,  2.95it/s]Matching cardholders:  79%|███████▊  | 258/328 [01:22<00:25,  2.71it/s]Matching cardholders:  79%|███████▉  | 259/328 [01:22<00:24,  2.77it/s]Matching cardholders:  79%|███████▉  | 260/328 [01:23<00:24,  2.83it/s]Matching cardholders:  80%|███████▉  | 261/328 [01:23<00:23,  2.88it/s]Matching cardholders:  80%|███████▉  | 262/328 [01:23<00:22,  2.90it/s]Matching cardholders:  80%|████████  | 263/328 [01:24<00:22,  2.92it/s]Matching cardholders:  80%|████████  | 264/328 [01:24<00:22,  2.90it/s]Matching cardholders:  81%|████████  | 265/328 [01:24<00:21,  2.91it/s]Matching cardholders:  81%|████████  | 266/328 [01:25<00:21,  2.93it/s]Matching cardholders:  81%|████████▏ | 267/328 [01:25<00:20,  2.94it/s]Matching cardholders:  82%|████████▏ | 268/328 [01:25<00:20,  2.94it/s]Matching cardholders:  82%|████████▏ | 269/328 [01:26<00:19,  2.95it/s]Matching cardholders:  82%|████████▏ | 270/328 [01:26<00:19,  2.96it/s]Matching cardholders:  83%|████████▎ | 271/328 [01:26<00:19,  2.95it/s]Matching cardholders:  83%|████████▎ | 272/328 [01:27<00:18,  2.96it/s]Matching cardholders:  83%|████████▎ | 273/328 [01:27<00:18,  2.96it/s]Matching cardholders:  84%|████████▎ | 274/328 [01:27<00:18,  2.96it/s]Matching cardholders:  84%|████████▍ | 275/328 [01:28<00:17,  2.96it/s]Matching cardholders:  84%|████████▍ | 276/328 [01:28<00:17,  2.97it/s]Matching cardholders:  84%|████████▍ | 277/328 [01:28<00:17,  2.96it/s]Matching cardholders:  85%|████████▍ | 278/328 [01:29<00:16,  2.96it/s]Matching cardholders:  85%|████████▌ | 279/328 [01:29<00:16,  2.95it/s]Matching cardholders:  85%|████████▌ | 280/328 [01:30<00:16,  2.95it/s]Matching cardholders:  86%|████████▌ | 281/328 [01:30<00:17,  2.71it/s]Matching cardholders:  86%|████████▌ | 282/328 [01:30<00:16,  2.77it/s]Matching cardholders:  86%|████████▋ | 283/328 [01:31<00:15,  2.83it/s]Matching cardholders:  87%|████████▋ | 284/328 [01:31<00:15,  2.87it/s]Matching cardholders:  87%|████████▋ | 285/328 [01:31<00:14,  2.89it/s]Matching cardholders:  87%|████████▋ | 286/328 [01:32<00:14,  2.91it/s]Matching cardholders:  88%|████████▊ | 287/328 [01:32<00:14,  2.93it/s]Matching cardholders:  88%|████████▊ | 288/328 [01:32<00:13,  2.93it/s]Matching cardholders:  88%|████████▊ | 289/328 [01:33<00:13,  2.94it/s]Matching cardholders:  88%|████████▊ | 290/328 [01:33<00:12,  2.95it/s]Matching cardholders:  89%|████████▊ | 291/328 [01:33<00:12,  2.94it/s]Matching cardholders:  89%|████████▉ | 292/328 [01:34<00:12,  2.94it/s]Matching cardholders:  89%|████████▉ | 293/328 [01:34<00:11,  2.95it/s]Matching cardholders:  90%|████████▉ | 294/328 [01:34<00:11,  2.94it/s]Matching cardholders:  90%|████████▉ | 295/328 [01:35<00:11,  2.95it/s]Matching cardholders:  90%|█████████ | 296/328 [01:35<00:10,  2.96it/s]Matching cardholders:  91%|█████████ | 297/328 [01:35<00:10,  2.95it/s]Matching cardholders:  91%|█████████ | 298/328 [01:36<00:10,  2.95it/s]Matching cardholders:  91%|█████████ | 299/328 [01:36<00:09,  2.95it/s]Matching cardholders:  91%|█████████▏| 300/328 [01:36<00:09,  2.95it/s]Matching cardholders:  92%|█████████▏| 301/328 [01:37<00:09,  2.96it/s]Matching cardholders:  92%|█████████▏| 302/328 [01:37<00:08,  2.96it/s]Matching cardholders:  92%|█████████▏| 303/328 [01:37<00:08,  2.95it/s]Matching cardholders:  93%|█████████▎| 304/328 [01:38<00:08,  2.71it/s]Matching cardholders:  93%|█████████▎| 305/328 [01:38<00:08,  2.78it/s]Matching cardholders:  93%|█████████▎| 306/328 [01:39<00:07,  2.82it/s]Matching cardholders:  94%|█████████▎| 307/328 [01:39<00:07,  2.87it/s]Matching cardholders:  94%|█████████▍| 308/328 [01:39<00:06,  2.90it/s]Matching cardholders:  94%|█████████▍| 309/328 [01:40<00:06,  2.91it/s]Matching cardholders:  95%|█████████▍| 310/328 [01:40<00:06,  2.93it/s]Matching cardholders:  95%|█████████▍| 311/328 [01:40<00:05,  2.94it/s]Matching cardholders:  95%|█████████▌| 312/328 [01:41<00:05,  2.94it/s]Matching cardholders:  95%|█████████▌| 313/328 [01:41<00:05,  2.95it/s]Matching cardholders:  96%|█████████▌| 314/328 [01:41<00:04,  2.95it/s]Matching cardholders:  96%|█████████▌| 315/328 [01:42<00:04,  2.95it/s]Matching cardholders:  96%|█████████▋| 316/328 [01:42<00:04,  2.95it/s]Matching cardholders:  97%|█████████▋| 317/328 [01:42<00:03,  2.97it/s]Matching cardholders:  97%|█████████▋| 318/328 [01:43<00:03,  2.96it/s]Matching cardholders:  97%|█████████▋| 319/328 [01:43<00:03,  2.96it/s]Matching cardholders:  98%|█████████▊| 320/328 [01:43<00:02,  2.89it/s]Matching cardholders:  98%|█████████▊| 321/328 [01:44<00:02,  2.88it/s]Matching cardholders:  98%|█████████▊| 322/328 [01:44<00:02,  2.91it/s]Matching cardholders:  98%|█████████▊| 323/328 [01:44<00:01,  2.92it/s]Matching cardholders:  99%|█████████▉| 324/328 [01:45<00:01,  2.92it/s]Matching cardholders:  99%|█████████▉| 325/328 [01:45<00:01,  2.94it/s]Matching cardholders:  99%|█████████▉| 326/328 [01:45<00:00,  2.95it/s]Matching cardholders: 100%|█████████▉| 327/328 [01:46<00:00,  2.69it/s]Matching cardholders: 100%|██████████| 328/328 [01:46<00:00,  2.77it/s]Matching cardholders: 100%|██████████| 328/328 [01:46<00:00,  3.08it/s]
Percentage of clients with card issued: 8.33%
Percentage of clients with card issued after matching: 16.67%

After each non-cardholder got the artifical card issued date assigned we drop the remaining non-cardholders without a match.

before_len = len(matched_non_card_holders_w_issue_date_df)
print(-(before_len - len(matched_non_card_holders_w_issue_date_df)))
matched_non_card_holders_w_issue_date_df = (
    matched_non_card_holders_w_issue_date_df.dropna(subset=["card_issued"])
)
data_reduction["Non-cardholders without match"] = -(
    before_len - len(matched_non_card_holders_w_issue_date_df)
)
del before_len
0

4.3 Aggregate on a Monthly Basis

After matching cardholders with non-cardholders and setting artificial card issue dates, we aggregate the transactional data on a monthly basis. This aggregation provides a comprehensive overview of financial activities for each account, facilitating further model development providing us with a fixed of features to work with.

The function aggregate_transactions_monthly is designed to process and summarize financial transactions on a monthly basis for each account within a dataset. The explanation of its workings, step by step, is as follows:

  1. Sorting Transactions: Initially, the function sorts the transactions in the provided DataFrame transactions_df based on account_id and the transaction date. This ensures that all transactions for a given account are ordered chronologically, which is crucial for accurate monthly aggregation and cumulative balance calculation.

  2. Monthly Grouping: Each transaction’s date is then converted to a monthly period using dt.to_period("M"). This step categorizes each transaction by the month and year it occurred, facilitating the aggregation of transactions on a monthly basis.

  3. Aggregation of Monthly Data: The function groups the sorted transactions by account_id and the newly created month column. For each group, it calculates several metrics:

    • volume: The sum of all transactions’ amounts for the month, representing the total money flow.
    • total_abs_amount: The sum of the absolute values of the transactions’ amounts, indicating the total amount of money moved, disregarding the direction.
    • transaction_count: The count of transactions, providing a sense of activity level.
    • positive_transaction_count and negative_transaction_count: The counts of positive (inflows) and negative (outflows) transactions, respectively. This distinction can help identify the balance between income and expenses.
    • Statistical measures like average_amount, median_amount, min_amount, max_amount, and std_amount offer insights into the distribution of transaction amounts.
    • type_count, operation_count, and k_symbol_count: The counts of unique transaction types, operations, and transaction symbols (k_symbol), respectively, indicating the diversity of transaction characteristics.
  4. Cumulative Balance Calculation: After aggregating the monthly data, the function computes a cumulative balance (balance) for each account by cumulatively summing the volume (total transaction amount) over time. This step provides insight into how the account balance evolves over the months.

As we have already explored and verified in the EDA section of the transactional data, each account starts with a transaction where the amount equals the inital balance. This validation ensures the integrity of the aggregated data, as the balance should accurately reflect the total transaction volume over time.

def aggregate_transactions_monthly(df):
    """
    Aggregate financial transaction data on a monthly basis per account.

    Parameters:
    - df (pd.DataFrame): DataFrame containing financial transaction data with 'account_id', 'date', and other relevant columns.

    - validate (bool): If True, validate the aggregated data. Default is True.

    Returns:
    - pd.DataFrame: Monthly aggregated financial transaction data per account.
    """
    df_sorted = df.sort_values(by=["account_id", "date"])
    df_sorted["month"] = df_sorted["date"].dt.to_period("M")

    monthly_aggregated_data = (
        df_sorted.groupby(["account_id", "month"])
        .agg(
            volume=("amount", "sum"),
            total_abs_amount=("amount", lambda x: x.abs().sum()),
            transaction_count=("amount", "count"),
            positive_transaction_count=(
                "amount",
                lambda x: (x >= 0).sum(),
            ),  # TODO: it seems that there are some transactions with 0 amount, how to handle those?
            negative_transaction_count=("amount", lambda x: (x < 0).sum()),
            average_amount=("amount", "mean"),
            median_amount=("amount", "median"),
            min_amount=("amount", "min"),
            max_amount=("amount", "max"),
            std_amount=("amount", "std"),
            type_count=("transaction_type", "nunique"),
            operation_count=("operation", "nunique"),
            k_symbol_count=("k_symbol", "nunique"),
        )
        .reset_index()
        .sort_values(by=["account_id", "month"])
    )

    monthly_aggregated_data["balance"] = monthly_aggregated_data.groupby("account_id")[
        "volume"
    ].cumsum()
    return monthly_aggregated_data


agg_transactions_monthly_df = aggregate_transactions_monthly(transactions_df)
agg_transactions_monthly_df.to_csv("./data/agg_transactions_monthly.csv", index=False)
agg_transactions_monthly_df.describe()
account_id volume total_abs_amount transaction_count positive_transaction_count negative_transaction_count average_amount median_amount min_amount max_amount std_amount type_count operation_count k_symbol_count balance
count 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 185057.000000 176803.000000 185057.000000 185057.000000 185057.000000 185057.000000
mean 2799.983292 1065.354397 33815.492309 5.708079 2.189017 3.519062 451.659265 -372.421445 -9607.378249 14756.009580 9030.305445 1.921181 3.568965 3.719649 34474.787632
std 2331.861909 12509.136299 37724.985550 2.417842 0.726115 2.173427 2479.100575 1933.445907 10746.883348 12958.692736 7402.806514 0.269457 0.832363 1.085701 19799.443508
min 1.000000 -101550.300000 14.600000 1.000000 0.000000 0.000000 -37000.000000 -37000.000000 -87400.000000 -37000.000000 0.000000 1.000000 1.000000 1.000000 -41125.800000
25% 1172.000000 -2266.600000 9659.500000 4.000000 2.000000 2.000000 -379.566667 -785.000000 -13428.000000 4756.000000 3283.937059 2.000000 3.000000 3.000000 20405.600000
50% 2375.000000 1058.100000 22933.100000 5.000000 2.000000 3.000000 220.260000 -14.600000 -6177.000000 10929.000000 6824.369949 2.000000 4.000000 4.000000 30000.000000
75% 3576.000000 4132.200000 43668.000000 7.000000 2.000000 5.000000 878.680000 44.700000 -2672.000000 21553.000000 12622.945077 2.000000 4.000000 4.000000 44540.500000
max 11382.000000 115038.200000 609736.200000 23.000000 9.000000 16.000000 44708.000000 44708.000000 44708.000000 74812.000000 57782.701468 2.000000 6.000000 7.000000 138317.800000

The validate_monthly_aggregated_transactions function is invoked to ensure the integrity and correctness of the aggregated data through several assertions:

  • The balance should consistently increase or decrease based on whether the total monthly transaction volume is positive or negative, respectively.
  • For each account, the balance in the first month should equal the total transaction volume of that month.
  • The sum of positive and negative transaction counts must equal the total transaction count for each month.
  • The number of unique accounts in the aggregated data should match that in the original dataset.
  • The final balances of accounts in the aggregated data should closely match their last recorded transactions in the original dataset.
def validate_monthly_aggregated_transactions(aggregated_data, original_df):
    """
    Validate the integrity and correctness of aggregated monthly financial transactions.

    Parameters:
    - aggregated_data (pd.DataFrame): Aggregated monthly transaction data.
    - original_df (pd.DataFrame): Original dataset of financial transactions.

    Raises:
    - AssertionError: If validation conditions are not met.
    """

    assert (aggregated_data["volume"] >= 0).all() == (
        aggregated_data["balance"].diff() >= 0
    ).all(), "If the total amount is positive, the balance should go up."

    assert (aggregated_data["volume"] < 0).all() == (
        aggregated_data["balance"].diff() < 0
    ).all(), "If the total amount is negative, the balance should go down."

    first_month = aggregated_data.groupby("account_id").nth(0)
    assert (
        first_month["volume"] == first_month["balance"]
    ).all(), "The balance should equal the volume for the first month."

    assert (
        aggregated_data["positive_transaction_count"]
        + aggregated_data["negative_transaction_count"]
        == aggregated_data["transaction_count"]
    ).all(), "The sum of positive and negative transaction counts should equal the total transaction count."

    assert (
        aggregated_data["account_id"].nunique() == original_df["account_id"].nunique()
    ), "The number of unique account_ids in the aggregated DataFrame should be the same as the original DataFrame."

    assert (
        pd.merge(
            aggregated_data.groupby("account_id")
            .last()
            .reset_index()[["account_id", "balance"]],
            original_df[
                original_df.groupby("account_id")["date"].transform("max")
                == original_df["date"]
            ][["account_id", "balance"]],
            on="account_id",
            suffixes=("_final", "_last"),
        )
        .apply(
            lambda x: np.isclose(x["balance_final"], x["balance_last"], atol=5), axis=1
        )
        .any()
    ), "Some accounts' final balances do not match their last transactions."


validate_monthly_aggregated_transactions(agg_transactions_monthly_df, transactions_df)

5 Exploratory Data Analysis: Aggregated Monthly Transactions

5.1 Monthly Balance Difference and Volume

This plot gives a clear picture of how money moves in and out of an account each month and how these movements affect the overall balance. It does this by showing two things:

  • Balance Difference: This line shows whether the account balance went up or down each month. If the line goes up, it means the account gained money that month. If it goes down, the account lost money.
  • Volume: This line shows the total amount of money that moved in the account each month, regardless of whether it was coming in or going out.

What to Look For: - A direct link between the amount of money moved (volume) and changes in the account balance. High incoming money should lead to an uptick in the balance, and lots of outgoing money should lead to a downturn. - This visual check helps to understand how active the account is and whether it’s generally getting fuller or emptier over time.

def plot_monthly_balance_diff_and_volume(
    transactions_monthly, account_id 
):
    account_transactions = transactions_monthly[
        transactions_monthly["account_id"] == account_id
    ].sort_values(by="month")
    account_transactions["balance_diff"] = account_transactions["balance"].diff()

    plt.figure(figsize=(9.5, 6))

    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["balance_diff"],
        marker="o",
        label="Balance Difference",
    )
    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["volume"],
        marker="x",
        linestyle="--",
        label="Volume",
    )

    plt.title(f"Monthly Balance Difference and Volume for Account {account_id}")
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.xticks(rotation=90, fontsize=7)
    plt.yticks(fontsize=8)
    plt.legend()
    plt.grid(True)
    plt.show()


plot_monthly_balance_diff_and_volume(agg_transactions_monthly_df, 2)

5.2 Monthly Transactions, Balance, and Volume Plot Explanation

This visualization offers a snapshot of an account’s activity over time by comparing money movement each month with the overall account balance. It helps to understand:

  • Volume: How much money came in or went out of the account each month. Incoming money is shown as up, and outgoing money as down.
  • Balance: The total money in the account at the end of each month, showing how it’s changed over time due to the monthly transactions.

What to Look For: - How the monthly money movement impacts the account’s growing or shrinking balance. For example, a few months of high income should visibly increase the balance. - This simple visual guide helps spot trends, like if the account is steadily growing, holding steady, or facing issues, giving quick insights into financial well-being and further validates the aggregation made in the previous step.

def plot_monthly_transactions_balance_and_volume(agg_transactions_monthly, account_id):
    account_transactions = agg_transactions_monthly[
        agg_transactions_monthly["account_id"] == account_id
    ]

    plt.figure(figsize=(9.5, 6))

    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["volume"],
        marker="o",
        label="Volume",
    )
    plt.plot(
        account_transactions["month"].astype(str),
        account_transactions["balance"],
        marker="x",
        linestyle="--",
        label="Balance",
    )

    plt.title(f"Monthly Transactions and Balance for Account {account_id}")
    plt.xlabel("Month")
    plt.ylabel("Value")
    plt.xticks(rotation=90, fontsize=7)
    plt.yticks(fontsize=8)
    plt.legend()
    plt.grid(True)
    plt.show()


plot_monthly_transactions_balance_and_volume(agg_transactions_monthly_df, 2)

5.3 Delieverable: Closer Look at Account 14

plot_monthly_transactions_balance_and_volume(agg_transactions_monthly_df, 14)

Account 14 shows a rather conservative transaction history. The spending habits are all withing range of 10k to -10k per month. We can see little volatility, the account shows a slight trend of growing.

5.4 Delieverable: Closer Look at Account 18

plot_monthly_transactions_balance_and_volume(agg_transactions_monthly_df, 18)

Account 18 paints a different picture in comparison to account 14.

The volatility here is a lot higher, indiciating a potential for a business account or high income household. Especially March 1994 to December 1994 show some volatile transaction habits.

Looking at the balance and volume per month for the accounts 14 and 18 we can notice some interesting patterns.

TODO: Add analysis

6 Pivot Transactions: Rolling Up to Monthly Aggregates

We have condensed transaction data into a monthly aggregated format. This aggregation serves a multifaceted purpose:

  • Monthly aggregation standardizes the time frame across which we analyze transactions, allowing us to compare transactional behaviors consistently across all accounts.
  • Aggregating data on a monthly level illuminates patterns that daily data might obscure. It enables us to discern trends over a broader time scale, capturing cyclical behaviors, seasonal effects, and response to macroeconomic events.
  • Daily transaction data can be “noisy” with random fluctuations. By considering monthly totals and averages, we reduce this noise, revealing underlying trends more clearly.
  • Our primary objective is to understand behaviors leading up to the issuance of a card. Aggregating transactions on a monthly basis helps focus on the crucial period preceding card issuance, enabling us to correlate transactional behaviors with the propensity to become a cardholder.
def pivot_transactions(
    non_transactional, transactions_monthly, months_before_card_range=(2, 13)
):
    """
    Aggregate monthly transaction data and merge it with non-transactional account data,
    focusing on the time frame leading up to the card issuance.

    This function merges monthly transaction data with non-transactional data to associate each
    transaction with the respective account and card issued date. It then filters transactions based
    on a specified range of months before card issuance and aggregates various transaction metrics.

    Parameters:
    - non_transactional (pd.DataFrame): A DataFrame containing non-transactional account data. This is only used to map card issuance dates to transactions.
    - transactions_monthly (pd.DataFrame): A DataFrame containing monthly transaction data.
    - months_before_card_range (tuple): A tuple specifying the inclusive range of months before card
                                        issuance to filter the transactions for aggregation.

    The aggregation includes the sum of volume and transaction counts, as well as the mean and other
    statistical measures of transaction amounts, for each account within the specified months before
    card issuance.

    The resulting DataFrame is pivoted to have 'account_id' as rows and the months before card
    issuance as columns, with aggregated metrics as values. Column names are constructed to
    describe the month and the metric represented.

    Returns:
    - pd.DataFrame: The final aggregated and pivoted dataset ready for analysis, with each row
                    representing an account and each column a specific metric in the months before
                    card issuance.
    """
    merged_df = transactions_monthly.merge(
        non_transactional[["account_id"]], on="account_id"
    )

    merged_df["card_issued_date"] = merged_df["account_id"].map(
        non_transactional.set_index("account_id")["card_issued"]
    )
    merged_df["months_before_card"] = merged_df.apply(
        lambda row: (row["card_issued_date"].to_period("M") - row["month"]).n, axis=1
    )

    start_month, end_month = months_before_card_range
    filtered_df = merged_df.query(f"{start_month} <= months_before_card <= {end_month}")

    aggregated_data = (
        filtered_df.groupby(["account_id", "months_before_card"])
        .agg(
            {
                "volume": "sum",
                "total_abs_amount": "sum",
                "transaction_count": "sum",
                "positive_transaction_count": "sum",
                "negative_transaction_count": "sum",
                "average_amount": "mean",
                "median_amount": "median",
                "min_amount": "min",
                "max_amount": "max",
                "std_amount": "std",
                "type_count": "sum",
                "operation_count": "sum",
                "k_symbol_count": "sum",
                "balance": "mean",
            }
        )
        .reset_index()
    )

    pivoted_data = aggregated_data.pivot(
        index="account_id", columns="months_before_card"
    )
    pivoted_data.columns = [
        "_".join(["M", str(col[1]), col[0]]) for col in pivoted_data.columns.values
    ]

    final_dataset = pivoted_data.reset_index()
    return final_dataset


transactions_pivoted_df = pivot_transactions(
    matched_non_card_holders_w_issue_date_df, agg_transactions_monthly_df
)
transactions_pivoted_df.describe()
account_id M_2_volume M_3_volume M_4_volume M_5_volume M_6_volume M_7_volume M_8_volume M_9_volume M_10_volume ... M_4_balance M_5_balance M_6_balance M_7_balance M_8_balance M_9_balance M_10_balance M_11_balance M_12_balance M_13_balance
count 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 ... 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 656.000000 655.000000 656.000000 656.000000
mean 2824.231707 -188.081402 2131.023476 -83.378354 1166.012652 -662.276982 1418.670732 1039.067683 1139.186890 -633.459299 ... 43713.226677 43796.605030 42630.592378 43292.869360 41874.198628 40835.130945 39695.944055 40352.773588 39736.136585 39596.693293
std 2370.634460 12069.931426 13667.115448 13070.792019 14518.623820 13857.280237 14971.381727 13142.470175 12976.711696 15761.581790 ... 20210.049464 21178.364095 20094.254246 20971.757649 20486.289111 19711.803414 19561.774856 20190.800543 19130.114948 19766.034091
min 11.000000 -76779.500000 -54322.500000 -69155.200000 -62718.000000 -67190.700000 -62113.900000 -84970.900000 -75013.500000 -66945.600000 ... 1762.200000 -4575.900000 677.800000 -8789.700000 -1299.300000 -7.900000 -3269.700000 820.700000 -9843.200000 192.000000
25% 1146.750000 -2860.275000 -2118.800000 -2363.675000 -3044.425000 -3524.050000 -2881.525000 -1730.000000 -2057.200000 -3582.750000 ... 26655.925000 26337.725000 26830.275000 26528.400000 25485.675000 25222.525000 24919.950000 24950.750000 25026.475000 24524.575000
50% 2330.500000 601.850000 1345.850000 1163.600000 1196.400000 841.700000 1198.750000 1328.200000 1105.650000 890.900000 ... 42680.500000 42269.700000 41417.900000 41245.050000 39681.050000 38903.600000 36803.850000 37070.200000 37223.300000 36958.000000
75% 3666.000000 3583.525000 4677.725000 4250.250000 4512.375000 4456.775000 4391.850000 4530.200000 4713.650000 3944.450000 ... 54942.825000 55514.200000 54229.400000 56922.150000 53359.175000 53476.650000 51744.600000 50787.050000 50391.900000 49328.125000
max 11382.000000 57541.800000 76136.000000 60272.100000 69456.400000 65912.700000 88209.200000 55601.500000 72059.700000 98041.500000 ... 105453.800000 112335.700000 106459.000000 111264.300000 106472.800000 108358.900000 132286.300000 108202.400000 112159.300000 111050.900000

8 rows × 169 columns

7 Merge everything together

golden_record_df = matched_non_card_holders_w_issue_date_df.merge(
    transactions_pivoted_df, on="account_id", how="left"
)
golden_record_df.to_csv("data/golden_record.csv", index=False)
data_reduction["Final Golden Record"] = len(golden_record_df)

assert golden_record_df[
    "client_id"
].is_unique, "Each client_id should appear exactly once in the final DataFrame."
assert golden_record_df[
    "account_id"
].is_unique, "Each account_id should appear exactly once in the final DataFrame."

golden_record_df.head()
account_id account_district_id account_frequency account_created account_district_name account_region account_inhabitants account_small_municipalities account_medium_municipalities account_large_municipalities ... M_4_balance M_5_balance M_6_balance M_7_balance M_8_balance M_9_balance M_10_balance M_11_balance M_12_balance M_13_balance
0 576 55 MONTHLY_ISSUANCE 1993-01-01 Brno - venkov south Moravia 157042 49 70 18 ... 35433.9 32763.3 30103.6 27455.2 39623.2 41346.8 40646.7 37953.8 35272.1 34357.8
1 3818 74 MONTHLY_ISSUANCE 1993-01-01 Ostrava - mesto north Moravia 323870 0 0 0 ... 32448.7 20928.3 48812.7 45898.7 38802.0 41537.8 30863.9 49023.8 44879.5 41579.8
2 704 55 MONTHLY_ISSUANCE 1993-01-01 Brno - venkov south Moravia 157042 49 70 18 ... 49756.6 35452.9 44789.8 33259.4 19026.1 49039.4 31829.9 35224.0 46092.2 31803.3
3 1695 76 MONTHLY_ISSUANCE 1993-01-03 Sumperk north Moravia 127369 31 32 13 ... 98674.9 97326.2 105257.0 68223.8 101761.7 62686.1 55853.1 81933.7 79168.4 99457.9
4 2379 44 MONTHLY_ISSUANCE 1993-01-10 Chrudim east Bohemia 105606 77 26 7 ... 20854.4 20774.5 19017.2 17267.0 15524.3 19539.7 15254.2 14955.0 13221.3 30056.2

5 rows × 232 columns

Looking at the first few rows of the final golden record, we can see the aggregated transactional data for each account, with columns representing various metrics for each month leading up to the card issuance date.

plt.figure()
plt.title("Number of Clients by Card Issuance Status")
sns.countplot(x="has_card", data=golden_record_df)
plt.xlabel("Card Issued")
plt.ylabel("Count")
plt.show()

We can see that the number of clients with a card issued is equal to the number of clients without a card issued, indicating a successful matching process.

plt.figure()
plt.title("Distribution of Card Issuance Dates")
sns.histplot(
    golden_record_df, x="card_issued", hue="has_card", kde=True, bins=30, alpha=0.5
)
plt.xlabel("Card Issuance Date")
plt.ylabel("Count")
plt.show()

The distribution of card issuance dates shows that the card issuance process was spread out over time, with an expected identical distribution for clients with and without cards issued.

8 Data Reduction Summary

The following waterfall chart visualizes the data reduction process, highlighting the number of records retained or lost at each stage.

import plotly.graph_objects as go

data_reduction_df = pd.DataFrame(
    list(data_reduction.items()), columns=["Category", "Amount"]
)
colors = ["skyblue" if amt >= 0 else "orange" for amt in data_reduction_df["Amount"]]

fig = go.Figure(
    go.Waterfall(
        name="20",
        orientation="v",
        measure=["relative"] * (len(data_reduction_df) - 1) + ["total"],
        x=data_reduction_df["Category"],
        textposition="outside",
        text=[f"{amt:,.0f}" for amt in data_reduction_df["Amount"]],
        y=data_reduction_df["Amount"],
        connector={"line": {"color": "black", "width": 2}},
        decreasing={"marker": {"color": "orange"}},
        increasing={"marker": {"color": "skyblue"}},
        totals={"marker": {"color": "skyblue"}},
    )
)

fig.update_layout(
    title="Enhanced Data Reduction Waterfall Chart",
    xaxis=dict(title="Category"),
    yaxis=dict(title="Amount", range=[0, 5500]),
    waterfallgap=0.3,
)
fig.show()

9 Exploratory Data Analysis: Golden Record

9.1 Comparing Cardholders and Non-Cardholders

9.1.7 Comparison of Average Feature Values

def plot_grouped_comparison(cardholders, non_cardholders, feature_columns):
    """
    Plots grouped bar charts for average feature values of cardholders and non-cardholders.

    Parameters:
    - cardholders (pd.DataFrame): DataFrame containing data for cardholders.
    - non_cardholders (pd.DataFrame): DataFrame containing data for non-cardholders.
    - feature_columns (list of str): List of column names whose averages to compare.
    """
    cardholder_avg = cardholders[feature_columns].mean()
    non_cardholder_avg = non_cardholders[feature_columns].mean()

    index = range(len(feature_columns))
    bar_width = 0.35

    fig, ax = plt.subplots()
    bars1 = ax.bar(
        index, cardholder_avg, bar_width, label="Cardholders", color="skyblue"
    )
    bars2 = ax.bar(
        [p + bar_width for p in index],
        non_cardholder_avg,
        bar_width,
        label="Non-Cardholders",
        color="orange",
    )

    ax.set_xlabel("Feature")
    ax.set_ylabel("Average Value")
    ax.set_title("Average Feature Values by Group")
    ax.set_xticks([p + bar_width / 2 for p in index])
    ax.set_xticklabels(feature_columns)
    ax.legend()

    plt.xticks(rotation=45)  # Rotate feature names for better visibility
    plt.show()


plot_grouped_comparison(
    golden_cardholders,
    golden_non_cardholders,
    [col for col in golden_record_df.columns if "balance" in col],
)
plot_grouped_comparison(golden_cardholders, golden_non_cardholders, ["loan_amount"])

## DEPENDENCIES TODO REMOVE FOR MERGE

# save golden record to temp
golden_record_df.to_parquet("temp/golden_record.parquet")
## DEPENDENCY #TODO REMOVE FOR MERGE

import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

golden_record_df = pd.read_parquet('temp/golden_record.parquet')

np.random.seed(1337)
random.seed(1337)

10 Data Partitioning

The data is split in a 80/20 ratio for training and testing purposes. The stratification ensures that the distribution of the target variable is maintained in both sets. When actually training the models, we will additionally use cross-validation to ensure robust evaluation.

from sklearn.model_selection import train_test_split


class DataModule:
    def __init__(self, X_train, X_test, y_train, y_test, feature_columns=None):
        self.feature_columns = (
            feature_columns if feature_columns is not None else X_train.columns
        )

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test


def create_data_module(df, feature_cols, target_col="has_card", test_size=0.2):
    X = df.drop(columns=[target_col])[feature_cols]
    y = df[target_col]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, shuffle=True
    )

    return DataModule(X_train, X_test, y_train, y_test)


data_module = create_data_module(
    golden_record_df, golden_record_df.drop(columns=["has_card"]).columns
)

print(f"Train set size: {len(data_module.X_train)}")
print(f"Test set size: {len(data_module.X_test)}")

print(f"Train set distribution:\n{data_module.y_train.value_counts(normalize=True)}")
print(f"Test set distribution:\n{data_module.y_test.value_counts(normalize=True)}")
Train set size: 524
Test set size: 132
Train set distribution:
has_card
False    0.5
True     0.5
Name: proportion, dtype: float64
Test set distribution:
has_card
True     0.5
False    0.5
Name: proportion, dtype: float64

As we can see the distribution of the target variable is maintained in both sets after the split.

11 Model Construction

11.1 Pipeline for Training and Evaluation

The train_evaluate_model function is designed to streamline the process of training and evaluating machine learning models. It performs the following steps:

  1. Preprocessing: The function automatically handles numerical and categorical features, imputing missing values, scaling numerical features, and one-hot encoding categorical features.
  2. Model Training: The specified model is trained on the training data.
  3. Cross-Validation: The model is evaluated using cross-validation with specified evaluation metrics.
  4. Model Evaluation: The model is evaluated on the test set using various metrics, including accuracy, F1 score, AUC-ROC, precision, and recall.

The pipeline is flexible and can accommodate various models and feature sets, making it a versatile tool for model development and evaluation. It returns a summary of evaluation metrics for both training and test sets, as well as the true labels and predicted probabilities for the test set.

from sklearn.feature_selection import RFECV
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn.metrics import (
    make_scorer,
    f1_score,
    roc_auc_score,
    precision_score,
    recall_score,
)
import scikitplot as skplt
import dalex as dx


class Trainer:
    def __init__(
        self,
        data_module,
        model,
        cv=10,
        select_features=False,
        param_grid=None,
        verbose=False,
        n_jobs=-1,
    ):
        self.data_module = data_module
        self.model = model
        self.cv = cv
        self.verbose = verbose
        self.preprocessor = self._create_preprocessor()
        self.select_features = select_features
        self.param_grid = param_grid
        self.n_jobs = n_jobs
        self.pipeline = None
        self.train_metrics_report = None
        self.test_metrics_report = None

    def _create_preprocessor(self):
        numerical_features = [
            col
            for col in self.data_module.X_train.columns
            if self.data_module.X_train[col].dtype in ["int64", "float64"]
        ]
        categorical_features = [
            col
            for col in self.data_module.X_train.columns
            if col not in numerical_features
        ]

        other_features = [
            col
            for col in self.data_module.X_train.columns
            if col not in numerical_features + categorical_features
        ]
        if len(other_features) > 0:
            raise ValueError(
                f"Columns with unsupported data types found: {other_features}"
            )

        numerical_pipeline = Pipeline(
            [("imputer", SimpleImputer(strategy="mean")), ("scaler", StandardScaler())]
        )

        categorical_pipeline = Pipeline(
            [
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore")),
            ]
        )

        return ColumnTransformer(
            transformers=[
                ("num", numerical_pipeline, numerical_features),
                ("cat", categorical_pipeline, categorical_features),
            ]
        )

    def fit(self):
        model_pipeline_steps = [("model", self.model)]
        if self.select_features:
            model_pipeline_steps.insert(
                0,
                (
                    "feature_selection",
                    RFECV(self.model, verbose=3 if self.verbose else 0, cv=self.cv),
                )
            )
            
        model_pipeline = Pipeline(model_pipeline_steps)
        
        if self.param_grid is not None:
            model_pipeline = GridSearchCV(
                model_pipeline,
                self.param_grid,
                cv=self.cv,
                verbose=3 if self.verbose else 0,
                n_jobs=self.n_jobs,
            )

        self.pipeline = Pipeline(
            [("preprocessor", self.preprocessor), ("model_pipeline", model_pipeline)]
        )

        self.pipeline.fit(self.data_module.X_train, self.data_module.y_train)
        return self

    @staticmethod
    def get_scoring_metrics():
        return ["accuracy", "f1_macro", "roc_auc", "precision", "recall"]

    def eval_train(self):
        scoring = {
            "accuracy": "accuracy",
            "f1_macro": make_scorer(f1_score),
            "roc_auc": "roc_auc",
            "precision": make_scorer(precision_score),
            "recall": make_scorer(recall_score),
        }

        cv_results = cross_validate(
            self.pipeline,
            self.data_module.X_train,
            self.data_module.y_train,
            scoring=scoring,
            cv=self.cv,
            return_train_score=False,
            n_jobs=self.n_jobs,
            verbose=3 if self.verbose else 0,
            return_estimator=True,
            return_indices=True,
            error_score="raise",
        )

        self.train_metrics_report = {
            metric: {
                "folds": cv_results[f"test_{metric}"].tolist(),
                "mean": cv_results[f"test_{metric}"].mean(),
                "std": cv_results[f"test_{metric}"].std(),
            }
            for metric in scoring
        }

        roc_data = []
        for i in range(self.cv):
            estimator = cv_results["estimator"][i]
            train_indices, test_indices = (
                cv_results["indices"]["train"][i],
                cv_results["indices"]["test"][i],
            )

            true_labels = self.data_module.y_train.iloc[test_indices]
            y_pred_proba = estimator.predict_proba(
                self.data_module.X_train.iloc[test_indices]
            )
            roc_data.append((true_labels, y_pred_proba))

        self.train_metrics_report["roc_data"] = roc_data

        return self

    def eval_test(self):
        X_test, y_test = self.data_module.X_test, self.data_module.y_test
        y_pred_proba = (
            self.pipeline.predict_proba(X_test)[:, 1]
            if hasattr(self.pipeline, "predict_proba")
            else np.nan
        )
        test_metrics = {
            "accuracy": self.pipeline.score(X_test, y_test),
            "f1_macro": f1_score(
                y_test, self.pipeline.predict(X_test), average="macro"
            ),
            "roc_auc": (
                roc_auc_score(y_test, y_pred_proba)
                if hasattr(self.pipeline, "predict_proba")
                else np.nan
            ),
            "precision": precision_score(y_test, self.pipeline.predict(X_test)),
            "recall": recall_score(y_test, self.pipeline.predict(X_test)),
        }
        self.test_metrics_report = {
            metric: test_metrics[metric] for metric in test_metrics
        }

        return self

    def get_pipeline(self):
        return self.pipeline

    def get_preprocessor(self):
        return self.preprocessor

    def get_train_metrics_report(self):
        return self.train_metrics_report

    def get_test_metrics_report(self):
        return self.test_metrics_report

    def get_best_params(self):
        if self.param_grid is None:
            raise ValueError(
                "No hyperparameter grid was provided during model training."
            )

        best_param = self.pipeline["model_pipeline"].best_params_
        return {key.split('__')[1]: value for key, value in best_param.items()}

    def get_selected_features(self):
        if not self.select_features:
            raise ValueError("Feature selection was not enabled during model training.")

        if (
            self.pipeline is None
            or "feature_selection"
            not in self.pipeline.named_steps["model_pipeline"].named_steps
        ):
            raise ValueError(
                "Feature selection has not been performed or the model is not fitted."
            )

        rfe = self.pipeline.named_steps["model_pipeline"].named_steps[
            "feature_selection"
        ]
        feature_mask = rfe.support_

        feature_names = self._get_feature_names_from_preprocessor()

        selected_features = [
            feature
            for feature, is_selected in zip(feature_names, feature_mask)
            if is_selected
        ]
        return [
            feature
            for feature in self.data_module.feature_columns
            if any([feature in col for col in selected_features])
        ]

    def _get_feature_names_from_preprocessor(self):
        transformers = self.preprocessor.transformers_
        feature_names = []

        for name, transformer, column in transformers:
            if hasattr(transformer, "get_feature_names_out"):
                feature_names.extend(transformer.get_feature_names_out(column))
            else:
                feature_names.extend(column)

        return feature_names

The following class handles the visualization of the model evaluation results. It provides various plots and metrics to assess the model’s performance and interpretability. The class can be used to compare multiple models and visualize their evaluation metrics side by side or individually. There is a distinction made between training and test metrics to ensure a comprehensive evaluation of the model’s performance.

from sklearn.metrics import roc_curve, classification_report, precision_recall_curve


class Visualizer:
    def __init__(self, trainer, model_name):
        self.trainer = trainer
        self.model_name = model_name

        X_train, X_test, y_train, y_test = (
            self.trainer.data_module.X_train,
            self.trainer.data_module.X_test,
            self.trainer.data_module.y_train,
            self.trainer.data_module.y_test,
        )

        self.explainer = dx.Explainer(trainer.get_pipeline(), X_test, y_test)

        self.X_test = X_test
        self.y_true = y_test
        self.y_test_pred_proba = trainer.get_pipeline().predict_proba(X_test)

    @staticmethod
    def compare_evaluation_metrics(visualizers):
        model_names = [viz.model_name for viz in visualizers]

        metrics = Trainer.get_scoring_metrics()
        means = {metric: [] for metric in metrics}
        stds = {metric: [] for metric in metrics}
        for viz in visualizers:
            train_metrics = viz.trainer.get_train_metrics_report()
            for metric in metrics:
                means[metric].append(np.mean(train_metrics[metric]["folds"]))
                stds[metric].append(np.std(train_metrics[metric]["folds"]))

        n_groups = len(metrics)
        bar_width = 0.15
        index = np.arange(n_groups)
        opacity = 0.8

        plt.figure(figsize=(15, 8))
        colors = plt.cm.viridis(np.linspace(0, 1, len(model_names)))

        for i, model_name in enumerate(model_names):
            bar_positions = index + bar_width * i
            bar_values = [means[metric][i] for metric in metrics]
            error_values = [stds[metric][i] for metric in metrics]

            bars = plt.bar(
                bar_positions,
                bar_values,
                bar_width,
                alpha=opacity,
                color=colors[i],
                yerr=error_values,
                capsize=5,
                label=model_name,
            )

            for bar, error in zip(bars, error_values):
                yval = bar.get_height()
                text_position = yval + error + 0.02
                plt.text(
                    bar.get_x() + bar.get_width() / 2,
                    text_position,
                    f"{yval:.2f}",
                    ha="center",
                    va="bottom",
                    fontsize=10,
                )

        plt.xlabel("Metrics", fontsize=14)
        plt.ylabel("Scores", fontsize=14)
        plt.title(
            f"Cross-Validation (k={visualizers[0].trainer.cv}) Evaluation Metrics Comparison",
            fontsize=16,
        )
        plt.xticks(index + bar_width * (len(model_names) - 1) / 2, metrics, fontsize=12)
        plt.ylim(0, 1.1)
        plt.legend(loc="upper left", bbox_to_anchor=(1, 1))

        plt.grid(True, which="major", linestyle="--", linewidth="0.5", color="grey")
        plt.tight_layout()
        plt.show()

    @staticmethod
    def compare_roc_curves(visualizers, dataset):
        if dataset not in ["test", "eval"]:
            raise ValueError("Invalid dataset option. Choose 'test' or 'eval'.")

        plt.figure(figsize=(8, 8))
        colors = plt.cm.viridis(np.linspace(0, 1, len(visualizers)))

        for i, viz in enumerate(visualizers):
            if dataset == "test":
                y_true = viz.trainer.data_module.y_test
                y_scores = viz.trainer.get_trained_model().predict_proba(
                    viz.trainer.data_module.X_test
                )[:, 1]
            elif dataset == "eval":
                y_true = []
                y_scores = []
                for fold in viz.trainer.get_train_metrics_report()["roc_data"]:
                    y_true.extend(fold[0])
                    y_scores.extend(fold[1][:, 1])

            fpr, tpr, _ = roc_curve(y_true, y_scores)
            auc_score = roc_auc_score(y_true, y_scores)
            plt.plot(
                fpr,
                tpr,
                label=f"{viz.model_name} (AUC = {auc_score:.2f})",
                color=colors[i],
            )

        plt.plot([0, 1], [0, 1], "k--")
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve Comparison on {dataset.capitalize()} Set")
        plt.legend(loc="lower right")
        plt.show()

    def plot_validation_metrics(self):
        train_metrics = self.trainer.get_train_metrics_report()
        cv = len(train_metrics["accuracy"]["folds"])

        metrics = self.trainer.get_scoring_metrics()
        fold_scores = {metric: train_metrics[metric]["folds"] for metric in metrics}

        plt.boxplot(fold_scores.values(), labels=metrics, notch=True)
        plt.title(f"{self.model_name}: Validation Metrics Box Plot (CV={cv})")
        plt.xlabel("Metrics")
        plt.ylabel("Score")
        plt.ylim(0, 1)
        plt.grid(True)
        plt.show()

    def plot_test_metrics(self):
        test_metrics = self.trainer.get_test_metrics_report()
        test_values = list(test_metrics.values())
        test_names = list(test_metrics.keys())

        sns.barplot(x=test_names, y=test_values)
        plt.title(f"{self.model_name}: Test Metrics")
        plt.xlabel("Metrics")
        plt.ylabel("Score")
        for i, v in enumerate(test_values):
            if np.isnan(v):
                plt.text(i, 0.5, "N/A", ha="center", va="bottom")
            else:
                plt.text(i, v + 0.01, f"{v:.2f}", ha="center", va="bottom")
        plt.ylim(0, 1)
        plt.grid(True)
        plt.show()

    def plot_confusion_matrix_test(self):
        preds = self.y_test_pred_proba.argmax(axis=1)
        skplt.metrics.plot_confusion_matrix(self.y_true, preds)
        plt.title(f"{self.model_name}: Confusion Matrix")
        plt.show()

    def plot_classification_report_test(self):
        preds = self.y_test_pred_proba.argmax(axis=1)
        report = classification_report(self.y_true, preds, output_dict=True)

        report_df = pd.DataFrame(report).transpose()
        report_df = report_df.round(2)

        table = plt.table(
            cellText=report_df.values,
            colLabels=report_df.columns,
            rowLabels=report_df.index,
            cellLoc="center",
            rowLoc="center",
            loc="center",
            fontsize=12,
        )
        table.auto_set_font_size(False)
        table.set_fontsize(12)
        table.scale(1.2, 1.2)

        plt.axis("off")
        plt.title(f"{self.model_name}: Classification Report")
        plt.show()

    def plot_threshold_optimization_test(self):
        precision, recall, thresholds = precision_recall_curve(
            self.y_true, self.y_test_pred_proba[:, 1]
        )
        f1_scores = 2 * (precision * recall) / (precision + recall)
        optimal_idx = np.argmax(f1_scores)
        optimal_threshold = thresholds[optimal_idx]

        plt.plot(thresholds, f1_scores[:-1], label="F1-score")
        plt.axvline(
            x=optimal_threshold,
            color="red",
            linestyle="--",
            label=f"Optimal Threshold: {optimal_threshold:.2f}",
        )
        plt.title(f"{self.model_name}: Threshold Optimization")
        plt.xlabel("Threshold")
        plt.ylabel("F1-score")
        plt.legend()
        plt.show()

    def plot_roc_curve_test(self):
        skplt.metrics.plot_roc(
            self.y_true, self.y_test_pred_proba, plot_micro=False, plot_macro=False
        )
        plt.title(f"{self.model_name}: ROC Curve on Test Set")
        plt.show()

    def plot_roc_curve_eval(self, show_folds=False):
        fig, ax = plt.subplots(figsize=(8, 8))
        colors = plt.cm.viridis(np.linspace(0, 1, self.trainer.cv))

        roc_data = self.trainer.get_train_metrics_report()["roc_data"]
        for k in range(self.trainer.cv):
            true_labels, y_pred_proba = roc_data[k]
            fpr, tpr, _ = roc_curve(true_labels, y_pred_proba[:, 1])
            auc_score = roc_auc_score(true_labels, y_pred_proba[:, 1])
            ax.plot(
                fpr, tpr, color=colors[k], label=f"Fold {k + 1} (AUC = {auc_score:.2f})"
            )

        plt.title(
            f"{self.model_name}: ROC Curves for each fold (CV={self.trainer.cv}, "
            f'Mean AUROC={self.trainer.train_metrics_report["roc_auc"]["mean"]:.2f})'
        )
        if show_folds:
            plt.legend(loc="lower right")

        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.grid(True)
        plt.show()

    def plot_precision_recall_curve_test(self):
        skplt.metrics.plot_precision_recall(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Precision-Recall Curve on Test Set")
        plt.show()

    def plot_lift_curve_test(self):
        skplt.metrics.plot_lift_curve(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Lift Curve on Test Set")
        plt.show()

    def plot_cumulative_gain_curve_test(self):
        skplt.metrics.plot_cumulative_gain(self.y_true, self.y_test_pred_proba)
        plt.title(f"{self.model_name}: Cumulative Gain Curve on Test Set")
        plt.show()

    def plot_partial_dependence_test(self, feature):
        pdp = self.explainer.model_profile(type="partial", variables=feature)
        pdp.plot()

    def plot_accumulated_local_effects_test(self, feature):
        ale = self.explainer.model_profile(type="accumulated", variables=feature)
        ale.plot()

    def plot_breakdown_test(self, observation):
        breakdown = self.explainer.predict_parts(observation, type="break_down")
        breakdown.plot()

    def plot_model_explanations_test(self):
        feature_importance = self.explainer.model_parts()
        feature_importance.plot()

        model_profile = self.explainer.model_profile(type="partial")
        model_profile.plot()

    def visualize_explanations_test(self, feature_columns=[]):
        self.plot_model_explanations()

        if not feature_columns:
            feature_columns = self.trainer.data_module.feature_columns[0]

        self.plot_partial_dependence(feature_columns)
        self.plot_accumulated_local_effects(feature_columns)

        observation = self.trainer.data_module.X_test.iloc[0]
        self.plot_breakdown(observation)

        plt.show()

11.2 Baseline Model: Logistic Regression

baseline_feature_columns = ["age", "client_region"] + [
    col
    for col in golden_record_df.columns
    if "M_" in col and ("_balance" in col or "_volume" in col)
]

baseline_data_module = create_data_module(golden_record_df, baseline_feature_columns)

print(f"Number of baseline feature columns: {len(baseline_feature_columns)}")
print(f"Baseline feature columns: {baseline_feature_columns}")
Number of baseline feature columns: 26
Baseline feature columns: ['age', 'client_region', 'M_2_volume', 'M_3_volume', 'M_4_volume', 'M_5_volume', 'M_6_volume', 'M_7_volume', 'M_8_volume', 'M_9_volume', 'M_10_volume', 'M_11_volume', 'M_12_volume', 'M_13_volume', 'M_2_balance', 'M_3_balance', 'M_4_balance', 'M_5_balance', 'M_6_balance', 'M_7_balance', 'M_8_balance', 'M_9_balance', 'M_10_balance', 'M_11_balance', 'M_12_balance', 'M_13_balance']
from sklearn.linear_model import LogisticRegression

baseline_trainer = (
    Trainer(baseline_data_module, LogisticRegression(max_iter=10000)).fit().eval_train()
)

baseline_visualizer = Visualizer(baseline_trainer, "Baseline Logistic Regression")
baseline_visualizer.plot_validation_metrics()
Preparation of a new explainer is initiated

  -> data              : 132 rows 26 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0111, mean = 0.518, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.971, mean = -0.0183, max = 0.803
  -> model_info        : package sklearn

A new explainer has been created!

baseline_visualizer.plot_roc_curve_eval(show_folds=True)

11.3 Adding more features

In order to possibly improve the model performance, we will include more features in the training data. We will include all features except for the ones that are not relevant for the model training.

After merging the transactional and non-transactional data, we have many columns that are unnecessary for model training. We will remove all columns containing card-related information, except for the has_card column. This decision stems from the fact that 50% of our dataset consists of cardholders and the other 50% consists of non-cardholders, which we matched with the cardholders. Therefore, the data in the non-target card-related columns come from the actual cardholders.

Additionally we will remove all columns that contain time-dependent information, such as dates and IDs, as they are not relevant for the model.

num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.loc[
    :,
    ~golden_record_df.columns.str.contains("card")
    | golden_record_df.columns.str.contains("has_card"),
]
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} card-related columns. Now {len(golden_record_df.columns)} columns remain."
)

num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=["loan_granted_date", "birth_date", "account_created"]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} time-dependent columns. Now {len(golden_record_df.columns)} columns remain."
)

num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=[
        "loan_account_id",
        "loan_loan_id",
        "order_account_id",
        "client_district_name",
        "disp_id",
        "account_id",
        "account_district_name",
    ]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} ID columns. Now {len(golden_record_df.columns)} columns remain."
)

num_cols_before = len(golden_record_df.columns)
golden_record_df = golden_record_df.drop(
    columns=[col for col in golden_record_df.columns if "std" in col]
)
print(
    f"Removed {num_cols_before - len(golden_record_df.columns)} std columns. Now {len(golden_record_df.columns)} columns remain."
)

cols_to_exclude_in_train = ["client_id", "has_card"]
all_cols_data_module = create_data_module(
    golden_record_df, golden_record_df.drop(columns=cols_to_exclude_in_train).columns
)
Removed 6 card-related columns. Now 226 columns remain.
Removed 3 time-dependent columns. Now 223 columns remain.
Removed 7 ID columns. Now 216 columns remain.
Removed 12 std columns. Now 204 columns remain.

11.4 Candidate Models

11.4.1 Logistic Regression

We will train a logistic regression model with the new feature set and evaluate its performance as it already showed promising results in the baseline model.

log_reg_trainer = (
    Trainer(all_cols_data_module, LogisticRegression(max_iter=10000)).fit().eval_train()
)

log_reg_visualizer = Visualizer(log_reg_trainer, "Logistic Regression")
log_reg_visualizer.plot_validation_metrics()
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 8.4e-06, mean = 0.479, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = 0.0214, max = 0.981
  -> model_info        : package sklearn

A new explainer has been created!

log_reg_visualizer.plot_roc_curve_eval(show_folds=True)

11.4.2 Random Forest

We will also train a Random Forest model to see if it can outperform the logistic regression model. Random Forest models are known for their robustness and ability to capture complex relationships in the data.

from sklearn.ensemble import RandomForestClassifier

rf_trainer = (
    Trainer(
        all_cols_data_module,
        RandomForestClassifier(),
    )
    .fit()
    .eval_train()
)

rf_visualizer = Visualizer(rf_trainer, "Random Forest")
rf_visualizer.plot_validation_metrics()
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.489, max = 0.98
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.86, mean = 0.0111, max = 0.74
  -> model_info        : package sklearn

A new explainer has been created!

rf_visualizer.plot_roc_curve_eval(show_folds=True)

11.4.3 Decision Tree

We will also train a Decision Tree model to see how it performs compared to the other models. Decision Trees are known for their interpretability and simplicity.

from sklearn.tree import DecisionTreeClassifier

decision_tree_trainer = (
    Trainer(
        all_cols_data_module,
        DecisionTreeClassifier(),
    )
    .fit()
    .eval_train()
)

decision_tree_visualizer = Visualizer(decision_tree_trainer, "Decision Tree")
decision_tree_visualizer.plot_validation_metrics()
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.477, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -1.0, mean = 0.0227, max = 1.0
  -> model_info        : package sklearn

A new explainer has been created!

decision_tree_visualizer.plot_roc_curve_eval(show_folds=True)

11.4.4 Gradient Boosting

Finally, we will train a Gradient Boosting model to see if it can outperform the other models. Gradient Boosting models are known for their high accuracy and ability to capture complex relationships in the data.

from sklearn.ensemble import GradientBoostingClassifier

gradient_boost_trainer = (
    Trainer(
        all_cols_data_module,
        GradientBoostingClassifier(),
    )
    .fit()
    .eval_train()
)

gradient_boost_visualizer = Visualizer(gradient_boost_trainer, "Gradient Boosting")
gradient_boost_visualizer.plot_validation_metrics()
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00355, mean = 0.5, max = 0.985
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.985, mean = 0.000161, max = 0.875
  -> model_info        : package sklearn

A new explainer has been created!

gradient_boost_visualizer.plot_roc_curve_eval(show_folds=True)

12 Model Comparison & Selection

candidate_trainers = [
    baseline_trainer,
    log_reg_trainer,
    rf_trainer,
    decision_tree_trainer,
    gradient_boost_trainer,
]
candidate_visualizers = [
    baseline_visualizer,
    log_reg_visualizer,
    rf_visualizer,
    decision_tree_visualizer,
    gradient_boost_visualizer,
]
Visualizer.compare_evaluation_metrics(candidate_visualizers)

Visualizer.compare_roc_curves(candidate_visualizers, dataset="eval")

12.1 Top-N Customer Selection

We will now use the trained models to generate a list of the top N customers who are most likely to not have a card. We will compare the lists generated by each model to see if there is any overlap in the predictions.

def create_top_n_customers_list(model, data):
    mandatory_columns = ["client_id", "has_card"]

    if not hasattr(model, "predict_proba"):
        raise ValueError("Model does not support probability predictions")

    if not all(col in data.columns for col in mandatory_columns):
        raise ValueError("Mandatory columns not found in data: 'client_id', 'has_card'")

    data = data[data["has_card"] == 0]

    probabilities = model.predict_proba(data.copy())
    # Probability of having a card (class 1). This essentially gives the clients who should most likely have a card based on the model but don't have one.
    probabilities = probabilities[:, 1]

    results = pd.DataFrame(
        {"Client ID": data["client_id"], "Probability": probabilities}
    )

    return results.sort_values(by="Probability", ascending=False).reset_index(drop=True)


def compare_top_n_lists(*lists, labels, top_n_percent):
    if len(lists) != len(labels):
        raise ValueError("Each list must have a corresponding label")

    if len(set([len(l) for l in lists])) != 1:
        raise ValueError("All lists must have the same length")

    for l in lists:
        if not l["Probability"].is_monotonic_decreasing:
            raise ValueError("Lists must be sorted in descending order of probability")

    top_n = int(len(lists[0]) * top_n_percent)
    lists = [l.head(top_n) for l in lists]

    overlap_matrix = pd.DataFrame(0, index=labels, columns=labels)

    for i, list1 in enumerate(lists):
        set1 = set(list1["Client ID"])
        for j, list2 in enumerate(lists):
            set2 = set(list2["Client ID"])
            overlap_matrix.iloc[i, j] = len(set1.intersection(set2))

    overlap_matrix = overlap_matrix / len(lists[0])
    return overlap_matrix


def visualize_overlap_matrix(overlap_matrix, title):
    plt.figure(figsize=(10, 8))

    mask = np.tril(np.ones_like(overlap_matrix, dtype=bool))
    overlap_matrix = overlap_matrix.mask(mask)

    sns.heatmap(
        overlap_matrix,
        annot=True,
        cmap="Blues",
        cbar_kws={"label": "Common Customers [%]"},
    )
    plt.title(title)
    plt.ylabel("List from Model/Method")
    plt.xlabel("List from Model/Method")
    plt.xticks(
        ticks=np.arange(len(overlap_matrix.columns)) + 0.5,
        labels=overlap_matrix.columns,
        rotation=45,
        ha="right",
    )
    plt.yticks(
        ticks=np.arange(len(overlap_matrix.index)) + 0.5,
        labels=overlap_matrix.index,
        rotation=0,
    )
    plt.show()

12.1.1 Top-10% Customer Selection

We will select the top 10% of customers who are most likely to not have a card according to each model.

customer_lists = [
    create_top_n_customers_list(trainer.get_pipeline(), golden_record_df)
    for trainer in candidate_trainers
]
candidate_labels = [
    "Baseline",
    "Logistic Regression",
    "Random Forest",
    "Decision Tree",
    "Gradient Boosting",
]
top_10_overlap_matrix = compare_top_n_lists(
    *customer_lists, labels=candidate_labels, top_n_percent=0.1
)
visualize_overlap_matrix(
    top_10_overlap_matrix, "Overlap of Top-10% Customer Lists by Model"
)

12.1.2 Top-5% Customer Selection

We will select the top 5% of customers who are most likely to not have a card according to each model.

top_5_overlap_matrix = compare_top_n_lists(
    *customer_lists, labels=candidate_labels, top_n_percent=0.05
)
visualize_overlap_matrix(
    top_5_overlap_matrix, "Overlap of Top-5% Customer Lists by Model"
)

12.2 Selected Model: Logistic Regression

best_model_trainer = log_reg_trainer
best_model_visualizer = log_reg_visualizer

best_model_trainer.eval_test()
<__main__.Trainer at 0x7fc1f349d250>
best_model_visualizer.plot_test_metrics()

best_model_visualizer.plot_roc_curve_test()

_, _ = (
    best_model_visualizer.plot_confusion_matrix_test(),
    best_model_visualizer.plot_classification_report_test(),
)

13 Model Optimization

After selecting the best model, we can further optimize its hyperparameters to improve its performance. We will use grid search with cross-validation to find the best hyperparameters for the logistic regression model.

gs_param_grid = {
    "model__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "model__penalty": ["l1", "l2"],
    "model__solver": ["liblinear"],
}

gs_trainer = (
    Trainer(
        all_cols_data_module,
        LogisticRegression(),
        param_grid=gs_param_grid,
        verbose=True,
    )
    .fit()
    .eval_train()
)

gs_trainer.get_best_params()
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.509 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.491 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.774 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.811 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.849 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.736 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.808 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.811 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.811 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.774 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.846 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.865 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.846 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.736 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.717 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.736 total time=   0.0s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.774 total time=   0.0s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.755 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.865 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.808 total time=   0.0s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.712 total time=   0.0s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.698 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.698 total time=   0.0s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.717 total time=   0.0s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.755 total time=   0.0s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.904 total time=   0.0s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.679 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.736 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.885 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.769 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.635 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.635 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.623 total time=   0.9s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.585 total time=   0.8s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.679 total time=   1.3s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.827 total time=   1.1s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   1.0s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.596 total time=   1.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.808 total time=   1.8s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.509 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.491 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.755 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.736 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.808 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.846 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.755 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.811 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.811 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.755 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.885 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.712 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.827 total time=   0.0s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.769 total time=   0.0s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.731 total time=   0.0s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.808 total time=   0.0s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.642 total time=   0.3s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.566 total time=   0.3s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.717 total time=   0.3s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.736 total time=   0.4s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.885 total time=   0.7s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.788 total time=   0.4s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.808 total time=   0.4s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.635 total time=   0.3s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.596 total time=   0.4s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.788 total time=   0.4s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.623 total time=   0.1s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   1.5s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.788 total time=   1.3s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.577 total time=   1.0s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.623 total time=   0.1s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.585 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.679 total time=   0.1s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.755 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.885 total time=   0.1s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.788 total time=   0.1s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.635 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.577 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.827 total time=   0.1s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.915 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.4s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.4s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.4s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.4s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.4s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.8s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.7s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.8s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.8s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.8s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.5s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.5s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.5s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.6s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.4s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.9s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.9s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.5s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.6s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.5s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
m��|���?�c�]�i�f��C��?ex��
��?1���b��$���K-���i�����?�+�d�࿡\���C�?�%��?�bI��S�?R?�W?��?�y��kI�[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.646 total time=   0.5s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.8s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.7s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.5s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.7s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.5s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   1.1s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.6s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   1.1s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.625 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.2s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.6s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.2s
[CV] END  accuracy: (test=0.755) f1_macro: (test=0.735) precision: (test=0.783) recall: (test=0.692) roc_auc: (test=0.855) total time=   9.8s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.894 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.5s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.7s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   1.1s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.6s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   1.8s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   1.5s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.9s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   1.8s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.8s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.596 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   1.8s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV] END  accuracy: (test=0.811) f1_macro: (test=0.800) precision: (test=0.833) recall: (test=0.769) roc_auc: (test=0.912) total time=  12.8s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.3s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.5s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.7s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.9s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.7s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.3s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.4s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.7s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.3s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.9s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.2s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.688 total time=   0.5s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   1.2s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.0s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.6s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.915 total time=   0.4s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.9s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   1.0s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   1.9s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.6s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   1.5s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.3s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.2s
[CV] END  accuracy: (test=0.811) f1_macro: (test=0.828) precision: (test=0.774) recall: (test=0.889) roc_auc: (test=0.902) total time=  10.7s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
�`��~ m��~�y��~����~Б��~���~p���~����~���~p���~����~����~���~[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.5s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.7s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.4s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.5s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.7s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.5s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.6s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.5s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.7s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   1.0s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.9s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.7s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.8s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.9s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.9s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.596 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.2s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.2s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   1.2s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   1.2s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.7s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.574 total time=   1.4s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   1.3s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   1.5s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.9s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   2.4s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.9s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   1.4s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.3s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.3s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.3s
[CV] END  accuracy: (test=0.755) f1_macro: (test=0.755) precision: (test=0.769) recall: (test=0.741) roc_auc: (test=0.836) total time=  13.6s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.5s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.4s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.4s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.8s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.667 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.2s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.2s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   1.0s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.4s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.4s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   1.0s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.9s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.0s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.5s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   1.1s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   1.1s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   1.3s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.2s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV] END  accuracy: (test=0.808) f1_macro: (test=0.815) precision: (test=0.786) recall: (test=0.846) roc_auc: (test=0.837) total time=   9.9s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.915 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.8s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.688 total time=   0.8s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.8s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   0.8s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.6s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.9s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.6s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.5s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.596 total time=   1.0s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   1.0s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.596 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.2s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.625 total time=   0.7s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.688 total time=   1.5s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   1.4s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.617 total time=   1.1s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   1.1s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   1.7s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   1.3s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.3s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.553 total time=   1.9s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.1s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.667 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   2.3s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.574 total time=   0.2s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.2s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.3s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.596 total time=   0.3s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.3s
[CV] END  accuracy: (test=0.769) f1_macro: (test=0.778) precision: (test=0.750) recall: (test=0.808) roc_auc: (test=0.883) total time=  14.3s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.894 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.875 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.875 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.5s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.646 total time=   0.7s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.9s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.6s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.7s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.8s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.596 total time=   0.5s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.8s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.5s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.8s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.7s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.625 total time=   1.3s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   1.5s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.2s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.7s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   1.3s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.0s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.553 total time=   1.0s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.1s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.667 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.646 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   1.5s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.3s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.872 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.638 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.596 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.2s
[CV] END  accuracy: (test=0.769) f1_macro: (test=0.769) precision: (test=0.769) recall: (test=0.769) roc_auc: (test=0.828) total time=  12.2s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.792 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.729 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.957 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 1/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.646 total time=   0.4s
[CV 2/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.604 total time=   0.5s
[CV 3/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.574 total time=   0.3s
[CV 4/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.8s
[CV 5/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.7s
[CV 7/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.7s
[CV 6/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.894 total time=   0.8s
[CV 8/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.8s
[CV 9/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.9s
[CV 1/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.688 total time=   0.1s
[CV 2/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.625 total time=   0.1s
[CV 3/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.1s
[CV 4/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 10/10] END model__C=10, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.8s
[CV 5/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 7/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 6/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.1s
[CV 9/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 8/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 10/10] END model__C=10, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.1s
[CV 1/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.667 total time=   0.5s
[CV 3/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.596 total time=   0.4s
[CV 2/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.604 total time=   1.3s
[CV 4/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.638 total time=   0.6s
[CV 6/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.894 total time=   0.6s
[CV 5/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   1.4s
[CV 7/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.8s
[CV 8/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   1.2s
[CV 9/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.681 total time=   1.0s
[CV 1/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.667 total time=   0.2s
[CV 2/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.583 total time=   0.2s
[CV 3/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.617 total time=   0.2s
[CV 4/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   1.8s
[CV 5/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.3s
[CV 6/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.894 total time=   0.2s
[CV 7/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.2s
[CV 8/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.2s
[CV 9/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.2s
[CV 10/10] END model__C=100, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.2s
[CV] END  accuracy: (test=0.769) f1_macro: (test=0.760) precision: (test=0.792) recall: (test=0.731) roc_auc: (test=0.902) total time=  11.7s
Fitting 10 folds for each of 12 candidates, totalling 120 fits
[CV 1/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.500 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.511 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l1, model__solver=liblinear;, score=0.489 total time=   0.0s
[CV 1/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 2/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.0s
[CV 3/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.702 total time=   0.0s
[CV 4/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 5/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 6/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.001, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.660 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l1, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.812 total time=   0.0s
[CV 2/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.833 total time=   0.0s
[CV 3/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 4/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 6/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 5/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 8/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.0s
[CV 7/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 9/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 10/10] END model__C=0.01, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.854 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.851 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.872 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.0s
[CV 1/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 2/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.771 total time=   0.0s
[CV 3/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 4/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 6/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 5/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.830 total time=   0.0s
[CV 7/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 8/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 10/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.809 total time=   0.0s
[CV 9/10] END model__C=0.1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 1/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.771 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.766 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.830 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.723 total time=   0.0s
[CV 7/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.745 total time=   0.0s
[CV 9/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.702 total time=   0.1s
[CV 10/10] END model__C=1, model__penalty=l1, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 2/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.708 total time=   0.1s
[CV 1/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.750 total time=   0.1s
[CV 4/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.681 total time=   0.1s
[CV 3/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.723 total time=   0.1s
[CV 5/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.787 total time=   0.1s
[CV 6/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.851 total time=   0.1s
[CV 8/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[CV 7/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.745 total time=   0.1s
[CV 9/10] END model__C=1, model__penalty=l2, model__solver=liblinear;, score=0.660 total time=   0.1s
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.0min finished
{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
gs_visualizer = Visualizer(gs_trainer, "Logistic Regression (Grid Search)")
gs_visualizer.plot_roc_curve_eval(show_folds=True)
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.model_selection._search.GridSearchCV (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0099, mean = 0.481, max = 0.996
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.958, mean = 0.0195, max = 0.796
  -> model_info        : package sklearn

A new explainer has been created!

gs_trainer.eval_test()
gs_visualizer.plot_test_metrics()

14 Model Explanation & Reduction

reduced_best_model_trainer = (
    Trainer(
        all_cols_data_module,
        LogisticRegression(**gs_trainer.get_best_params()),
        select_features=True,
    )
    .fit()
    .eval_train()
)

reduced_best_model_trainer.get_selected_features()
['M_2_balance', 'M_3_balance', 'M_4_balance', 'M_6_balance', 'M_13_balance']
reduced_best_model_visualizer = Visualizer(
    reduced_best_model_trainer, "Reduced Logistic Regression"
)

Visualizer.compare_evaluation_metrics(
    [best_model_visualizer, reduced_best_model_visualizer, gs_visualizer]
)
Preparation of a new explainer is initiated

  -> data              : 132 rows 202 cols
  -> target variable   : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray.
  -> target variable   : 132 values
  -> model_class       : sklearn.pipeline.Pipeline (default)
  -> label             : Not specified, model's class short name will be used. (default)
  -> predict function  : <function yhat_proba_default at 0x7fc1d01214e0> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.00803, mean = 0.47, max = 0.996
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.95, mean = 0.0302, max = 0.835
  -> model_info        : package sklearn

A new explainer has been created!

14.1 Lift Curve

reduced_best_model_visualizer.plot_lift_curve_test()

15 Conclusion